From 240e79706f18d0092fa54698c2e16b7c2ddd127b Mon Sep 17 00:00:00 2001 From: Miguel Jacq Date: Sat, 20 Dec 2025 17:47:00 +1100 Subject: [PATCH] Allow the user to add extra paths to harvest, or paths to ignore, using `--exclude-path` and `--include-path` arguments. --- CHANGELOG.md | 5 + README.md | 26 ++++ enroll/cli.py | 78 +++++++++++- enroll/diff.py | 6 + enroll/harvest.py | 110 ++++++++++++++++ enroll/manifest.py | 115 +++++++++++++++++ enroll/pathfilter.py | 293 +++++++++++++++++++++++++++++++++++++++++++ enroll/remote.py | 21 +++- tests/test_cli.py | 45 ++++++- 9 files changed, 687 insertions(+), 12 deletions(-) create mode 100644 enroll/pathfilter.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e80a13..2d8d6e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +# 0.1.3 + + * Allow the user to add extra paths to harvest, or paths to ignore, using `--exclude-path` and `--include-path` + arguments. + # 0.1.2 * Include files from `/usr/local/bin` and `/usr/local/etc` in harvest (assuming they aren't binaries or diff --git a/README.md b/README.md index 6645437..84a6965 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ Harvest state about a host and write a harvest bundle. - Changed-from-default config (plus related custom/unowned files under service dirs) - Non-system users + SSH public keys - Misc `/etc` that can’t be attributed to a package (`etc_custom` role) +- Optional user-specified extra files/dirs via `--include-path` (emitted as an `extra_paths` role at manifest time) **Common flags** - Remote harvesting: @@ -79,6 +80,14 @@ Harvest state about a host and write a harvest bundle. - `--dangerous`: disables secret-safety checks (see “Sensitive data” below) - Encrypt bundles at rest: - `--sops `: writes a single encrypted `harvest.tar.gz.sops` instead of a plaintext directory +- Path selection (include/exclude): + - `--include-path <pattern>` (repeatable): add extra files/dirs to harvest (even from locations normally ignored, like `/home`). 
Still subject to secret-safety checks unless `--dangerous`. + - `--exclude-path <pattern>` (repeatable): skip files/dirs even if they would normally be harvested. + - Pattern syntax: + - plain path: matches that file; directories match the directory + everything under it + - glob (default): supports `*` and `**` (prefix with `glob:` to force) + - regex: prefix with `re:` or `regex:` + - Precedence: excludes win over includes. --- @@ -227,6 +236,23 @@ enroll harvest --out /tmp/enroll-harvest enroll harvest --remote-host myhost.example.com --remote-user myuser --out /tmp/enroll-harvest ``` +### Include paths (`--include-path`) +```bash +# Add a few dotfiles from /home (still secret-safe unless --dangerous) +enroll harvest --out /tmp/enroll-harvest --include-path '/home/*/.bashrc' --include-path '/home/*/.profile' +``` + +### Exclude paths (`--exclude-path`) +```bash +# Skip specific /usr/local/bin entries (or patterns) +enroll harvest --out /tmp/enroll-harvest --exclude-path '/usr/local/bin/docker-*' --exclude-path '/usr/local/bin/some-tool' +``` + +### Regex include +```bash +enroll harvest --out /tmp/enroll-harvest --include-path 're:^/home/[^/]+/\.config/myapp/.*$' +``` + ### `--dangerous` ```bash enroll harvest --out /tmp/enroll-harvest --dangerous diff --git a/enroll/cli.py b/enroll/cli.py index 2d8ed5e..f6efe11 100644 --- a/enroll/cli.py +++ b/enroll/cli.py @@ -125,6 +125,27 @@ def main() -> None: action="store_true", help="Collect files more aggressively (may include secrets). Disables secret-avoidance checks.", ) + h.add_argument( + "--include-path", + action="append", + default=[], + metavar="PATTERN", + help=( + "Include extra file paths to harvest (repeatable). Supports globs (including '**') and regex via 're:'. " + "Included files are still filtered by IgnorePolicy unless --dangerous is used." + ), + ) + h.add_argument( + "--exclude-path", + action="append", + default=[], + metavar="PATTERN", + help=( + "Exclude file paths from harvesting (repeatable). 
Supports globs (including '**') and regex via 're:'. " + "Excludes apply to all harvesting, including defaults." + ), + ) + h.add_argument( "--sops", nargs="+", @@ -186,6 +207,27 @@ def main() -> None: action="store_true", help="Collect files more aggressively (may include secrets). Disables secret-avoidance checks.", ) + s.add_argument( + "--include-path", + action="append", + default=[], + metavar="PATTERN", + help=( + "Include extra file paths to harvest (repeatable). Supports globs (including '**') and regex via 're:'. " + "Included files are still filtered by IgnorePolicy unless --dangerous is used." + ), + ) + s.add_argument( + "--exclude-path", + action="append", + default=[], + metavar="PATTERN", + help=( + "Exclude file paths from harvesting (repeatable). Supports globs (including '**') and regex via 're:'. " + "Excludes apply to all harvesting, including defaults." + ), + ) + s.add_argument( "--sops", nargs="+", @@ -320,6 +362,8 @@ def main() -> None: remote_user=args.remote_user, dangerous=bool(args.dangerous), no_sudo=bool(args.no_sudo), + include_paths=list(getattr(args, "include_path", []) or []), + exclude_paths=list(getattr(args, "exclude_path", []) or []), ) _encrypt_harvest_dir_to_sops( tmp_bundle, out_file, list(sops_fps) @@ -338,6 +382,8 @@ def main() -> None: remote_user=args.remote_user, dangerous=bool(args.dangerous), no_sudo=bool(args.no_sudo), + include_paths=list(getattr(args, "include_path", []) or []), + exclude_paths=list(getattr(args, "exclude_path", []) or []), ) print(str(state)) else: @@ -350,7 +396,12 @@ def main() -> None: os.chmod(tmp_bundle, 0o700) except OSError: pass - harvest(str(tmp_bundle), dangerous=bool(args.dangerous)) + harvest( + str(tmp_bundle), + dangerous=bool(args.dangerous), + include_paths=list(getattr(args, "include_path", []) or []), + exclude_paths=list(getattr(args, "exclude_path", []) or []), + ) _encrypt_harvest_dir_to_sops( tmp_bundle, out_file, list(sops_fps) ) @@ -360,7 +411,12 @@ def main() -> None: 
raise SystemExit( "error: --out is required unless --remote-host is set" ) - path = harvest(args.out, dangerous=bool(args.dangerous)) + path = harvest( + args.out, + dangerous=bool(args.dangerous), + include_paths=list(getattr(args, "include_path", []) or []), + exclude_paths=list(getattr(args, "exclude_path", []) or []), + ) print(path) elif args.cmd == "manifest": out_enc = manifest( @@ -446,6 +502,8 @@ def main() -> None: remote_user=args.remote_user, dangerous=bool(args.dangerous), no_sudo=bool(args.no_sudo), + include_paths=list(getattr(args, "include_path", []) or []), + exclude_paths=list(getattr(args, "exclude_path", []) or []), ) _encrypt_harvest_dir_to_sops( tmp_bundle, out_file, list(sops_fps) @@ -473,6 +531,8 @@ def main() -> None: remote_user=args.remote_user, dangerous=bool(args.dangerous), no_sudo=bool(args.no_sudo), + include_paths=list(getattr(args, "include_path", []) or []), + exclude_paths=list(getattr(args, "exclude_path", []) or []), ) manifest( str(harvest_dir), @@ -493,7 +553,12 @@ def main() -> None: os.chmod(tmp_bundle, 0o700) except OSError: pass - harvest(str(tmp_bundle), dangerous=bool(args.dangerous)) + harvest( + str(tmp_bundle), + dangerous=bool(args.dangerous), + include_paths=list(getattr(args, "include_path", []) or []), + exclude_paths=list(getattr(args, "exclude_path", []) or []), + ) _encrypt_harvest_dir_to_sops( tmp_bundle, out_file, list(sops_fps) ) @@ -512,7 +577,12 @@ def main() -> None: raise SystemExit( "error: --harvest is required unless --remote-host is set" ) - harvest(args.harvest, dangerous=bool(args.dangerous)) + harvest( + args.harvest, + dangerous=bool(args.dangerous), + include_paths=list(getattr(args, "include_path", []) or []), + exclude_paths=list(getattr(args, "exclude_path", []) or []), + ) manifest( args.harvest, args.out, diff --git a/enroll/diff.py b/enroll/diff.py index e2861c9..a2b7d91 100644 --- a/enroll/diff.py +++ b/enroll/diff.py @@ -196,6 +196,12 @@ def _iter_managed_files(state: Dict[str, Any]) 
-> Iterable[Tuple[str, Dict[str, for mf in ul.get("managed_files", []) or []: yield str(ul_role), mf + # extra_paths + xp = state.get("extra_paths") or {} + xp_role = xp.get("role_name") or "extra_paths" + for mf in xp.get("managed_files", []) or []: + yield str(xp_role), mf + def _file_index(bundle_dir: Path, state: Dict[str, Any]) -> Dict[str, FileRec]: """Return mapping of absolute path -> FileRec. diff --git a/enroll/harvest.py b/enroll/harvest.py index 659bebc..48242d6 100644 --- a/enroll/harvest.py +++ b/enroll/harvest.py @@ -19,6 +19,7 @@ from .debian import ( stat_triplet, ) from .ignore import IgnorePolicy +from .pathfilter import PathFilter, expand_includes from .accounts import collect_non_system_users @@ -86,6 +87,16 @@ class UsrLocalCustomSnapshot: notes: List[str] +@dataclass +class ExtraPathsSnapshot: + role_name: str + include_patterns: List[str] + exclude_patterns: List[str] + managed_files: List[ManagedFile] + excluded: List[ExcludedFile] + notes: List[str] + + ALLOWED_UNOWNED_EXTS = { ".conf", ".cfg", @@ -250,6 +261,8 @@ def harvest( policy: Optional[IgnorePolicy] = None, *, dangerous: bool = False, + include_paths: Optional[List[str]] = None, + exclude_paths: Optional[List[str]] = None, ) -> str: # If a policy is not supplied, build one. `--dangerous` relaxes secret # detection and deny-glob skipping. @@ -261,6 +274,10 @@ def harvest( policy.dangerous = True os.makedirs(bundle_dir, exist_ok=True) + # User-provided includes/excludes. Excludes apply to all harvesting; + # includes are harvested into an extra role. 
+ path_filter = PathFilter(include=include_paths or (), exclude=exclude_paths or ()) + if hasattr(os, "geteuid") and os.geteuid() != 0: print( "Warning: not running as root; harvest may miss files or metadata.", @@ -406,6 +423,9 @@ def harvest( ) for path, reason in sorted(candidates.items()): + if path_filter.is_excluded(path): + excluded.append(ExcludedFile(path=path, reason="user_excluded")) + continue deny = policy.deny_reason(path) if deny: excluded.append(ExcludedFile(path=path, reason=deny)) @@ -522,6 +542,9 @@ def harvest( candidates.setdefault(r, "custom_specific_path") for path, reason in sorted(candidates.items()): + if path_filter.is_excluded(path): + excluded.append(ExcludedFile(path=path, reason="user_excluded")) + continue deny = policy.deny_reason(path) if deny: excluded.append(ExcludedFile(path=path, reason=deny)) @@ -593,6 +616,9 @@ def harvest( # Copy only safe SSH public material: authorized_keys + *.pub for sf in u.ssh_files: + if path_filter.is_excluded(sf): + users_excluded.append(ExcludedFile(path=sf, reason="user_excluded")) + continue deny = policy.deny_reason(sf) if deny: users_excluded.append(ExcludedFile(path=sf, reason=deny)) @@ -665,6 +691,10 @@ def harvest( if not _is_confish(path): continue + if path_filter.is_excluded(path): + etc_excluded.append(ExcludedFile(path=path, reason="user_excluded")) + continue + deny = policy.deny_reason(path) if deny: etc_excluded.append(ExcludedFile(path=path, reason=deny)) @@ -754,6 +784,10 @@ def harvest( ul_excluded.append(ExcludedFile(path=path, reason="unreadable")) continue + if path_filter.is_excluded(path): + ul_excluded.append(ExcludedFile(path=path, reason="user_excluded")) + continue + deny = policy.deny_reason(path) if deny: ul_excluded.append(ExcludedFile(path=path, reason=deny)) @@ -806,6 +840,81 @@ def harvest( notes=ul_notes, ) + # ------------------------- + # extra_paths role (user-requested includes) + # ------------------------- + extra_notes: List[str] = [] + extra_excluded: 
List[ExcludedFile] = [] + extra_managed: List[ManagedFile] = [] + extra_role_name = "extra_paths" + + include_specs = list(include_paths or []) + exclude_specs = list(exclude_paths or []) + + if include_specs: + extra_notes.append("User include patterns:") + extra_notes.extend([f"- {p}" for p in include_specs]) + if exclude_specs: + extra_notes.append("User exclude patterns:") + extra_notes.extend([f"- {p}" for p in exclude_specs]) + + included_files: List[str] = [] + if include_specs: + files, inc_notes = expand_includes( + path_filter.iter_include_patterns(), + exclude=path_filter, + max_files=4000, + ) + included_files = files + extra_notes.extend(inc_notes) + + for path in included_files: + if path in already_all: + continue + + if path_filter.is_excluded(path): + extra_excluded.append(ExcludedFile(path=path, reason="user_excluded")) + continue + + deny = policy.deny_reason(path) + if deny: + extra_excluded.append(ExcludedFile(path=path, reason=deny)) + continue + + try: + owner, group, mode = stat_triplet(path) + except OSError: + extra_excluded.append(ExcludedFile(path=path, reason="unreadable")) + continue + + src_rel = path.lstrip("/") + try: + _copy_into_bundle(bundle_dir, extra_role_name, path, src_rel) + except OSError: + extra_excluded.append(ExcludedFile(path=path, reason="unreadable")) + continue + + extra_managed.append( + ManagedFile( + path=path, + src_rel=src_rel, + owner=owner, + group=group, + mode=mode, + reason="user_include", + ) + ) + already_all.add(path) + + extra_paths_snapshot = ExtraPathsSnapshot( + role_name=extra_role_name, + include_patterns=include_specs, + exclude_patterns=exclude_specs, + managed_files=extra_managed, + excluded=extra_excluded, + notes=extra_notes, + ) + state = { "host": {"hostname": os.uname().nodename, "os": "debian"}, "users": asdict(users_snapshot), @@ -815,6 +924,7 @@ def harvest( "package_roles": [asdict(p) for p in pkg_snaps], "etc_custom": asdict(etc_custom_snapshot), "usr_local_custom": 
asdict(usr_local_custom_snapshot), + "extra_paths": asdict(extra_paths_snapshot), } state_path = os.path.join(bundle_dir, "state.json") diff --git a/enroll/manifest.py b/enroll/manifest.py index 6909c5c..2f28eab 100644 --- a/enroll/manifest.py +++ b/enroll/manifest.py @@ -630,6 +630,7 @@ def _manifest_from_bundle_dir( users_snapshot: Dict[str, Any] = state.get("users", {}) etc_custom_snapshot: Dict[str, Any] = state.get("etc_custom", {}) usr_local_custom_snapshot: Dict[str, Any] = state.get("usr_local_custom", {}) + extra_paths_snapshot: Dict[str, Any] = state.get("extra_paths", {}) site_mode = fqdn is not None and fqdn != "" @@ -663,6 +664,7 @@ def _manifest_from_bundle_dir( manifested_users_roles: List[str] = [] manifested_etc_custom_roles: List[str] = [] manifested_usr_local_custom_roles: List[str] = [] + manifested_extra_paths_roles: List[str] = [] manifested_service_roles: List[str] = [] manifested_pkg_roles: List[str] = [] @@ -1098,6 +1100,118 @@ Unowned /etc config files not attributed to packages or services. 
manifested_usr_local_custom_roles.append(role) + # ------------------------- + # extra_paths role (user-requested includes) + # ------------------------- + if extra_paths_snapshot and extra_paths_snapshot.get("managed_files"): + role = extra_paths_snapshot.get("role_name", "extra_paths") + role_dir = os.path.join(roles_root, role) + _write_role_scaffold(role_dir) + + var_prefix = role + + managed_files = extra_paths_snapshot.get("managed_files", []) + excluded = extra_paths_snapshot.get("excluded", []) + notes = extra_paths_snapshot.get("notes", []) + include_pats = extra_paths_snapshot.get("include_patterns", []) or [] + exclude_pats = extra_paths_snapshot.get("exclude_patterns", []) or [] + + templated, jt_vars = _jinjify_managed_files( + bundle_dir, + role, + role_dir, + managed_files, + jt_exe=jt_exe, + jt_enabled=jt_enabled, + overwrite_templates=not site_mode, + ) + + if site_mode: + _copy_artifacts( + bundle_dir, + role, + _host_role_files_dir(out_dir, fqdn or "", role), + exclude_rels=templated, + ) + else: + _copy_artifacts( + bundle_dir, + role, + os.path.join(role_dir, "files"), + exclude_rels=templated, + ) + + files_var = _build_managed_files_var( + managed_files, + templated, + notify_other=None, + notify_systemd=None, + ) + + jt_map = _yaml_load_mapping(jt_vars) if jt_vars.strip() else {} + vars_map: Dict[str, Any] = {f"{var_prefix}_managed_files": files_var} + vars_map = _merge_mappings_overwrite(vars_map, jt_map) + + if site_mode: + _write_role_defaults(role_dir, {f"{var_prefix}_managed_files": []}) + _write_hostvars(out_dir, fqdn or "", role, vars_map) + else: + _write_role_defaults(role_dir, vars_map) + + tasks = "---\n" + _render_generic_files_tasks( + var_prefix, include_restart_notify=False + ) + with open( + os.path.join(role_dir, "tasks", "main.yml"), "w", encoding="utf-8" + ) as f: + f.write(tasks.rstrip() + "\n") + + with open( + os.path.join(role_dir, "handlers", "main.yml"), "w", encoding="utf-8" + ) as f: + f.write("---\n") + + with 
open( + os.path.join(role_dir, "meta", "main.yml"), "w", encoding="utf-8" + ) as f: + f.write("---\ndependencies: []\n") + + readme = ( + f"""# {role} + +User-requested extra file harvesting. + +## Include patterns +""" + + ("\n".join([f"- {p}" for p in include_pats]) or "- (none)") + + """\n +## Exclude patterns +""" + + ("\n".join([f"- {p}" for p in exclude_pats]) or "- (none)") + + """\n +## Managed files +""" + + ("\n".join([f"- {mf.get('path')}" for mf in managed_files]) or "- (none)") + + """\n +## Excluded +""" + + ( + "\n".join([f"- {e.get('path')} ({e.get('reason')})" for e in excluded]) + or "- (none)" + ) + + """\n +## Notes +""" + + ("\n".join([f"- {n}" for n in notes]) or "- (none)") + + """\n""" + ) + with open(os.path.join(role_dir, "README.md"), "w", encoding="utf-8") as f: + f.write(readme) + + manifested_extra_paths_roles.append(role) + + manifested_usr_local_custom_roles.append(role) + # ------------------------- # ------------------------- @@ -1412,6 +1526,7 @@ Generated for package `{pkg}`. + manifested_service_roles + manifested_etc_custom_roles + manifested_usr_local_custom_roles + + manifested_extra_paths_roles + manifested_users_roles ) diff --git a/enroll/pathfilter.py b/enroll/pathfilter.py new file mode 100644 index 0000000..9df4afa --- /dev/null +++ b/enroll/pathfilter.py @@ -0,0 +1,293 @@ +from __future__ import annotations + +import glob +import os +import re +from dataclasses import dataclass +from pathlib import PurePosixPath +from typing import List, Optional, Sequence, Set, Tuple + + +_REGEX_PREFIXES = ("re:", "regex:") + + +def _has_glob_chars(s: str) -> bool: + return any(ch in s for ch in "*?[") + + +def _norm_abs(p: str) -> str: + """Normalise a path-ish string to an absolute POSIX path. + + We treat inputs that don't start with '/' as being relative to '/'. + """ + + p = p.strip() + if not p: + return "/" + if not p.startswith("/"): + p = "/" + p + # `normpath` keeps a leading '/' for absolute paths. 
+ return os.path.normpath(p) + + +def _posix_match(path: str, pattern: str) -> bool: + """Path matching with glob semantics. + + Uses PurePosixPath.match which: + - treats '/' as a segment separator + - supports '**' for recursive matching + + Both `path` and `pattern` are treated as absolute paths. + """ + + # PurePosixPath.match is anchored and works best on relative strings. + p = path.lstrip("/") + pat = pattern.lstrip("/") + try: + return PurePosixPath(p).match(pat) + except Exception: + # If the pattern is somehow invalid, fail closed. + return False + + +def _regex_literal_prefix(regex: str) -> str: + """Best-effort literal prefix extraction for a regex. + + This lets us pick a starting directory to walk when expanding regex-based + include patterns. + """ + + s = regex + if s.startswith("^"): + s = s[1:] + out: List[str] = [] + escaped = False + meta = set(".^$*+?{}[]\\|()") + for ch in s: + if escaped: + out.append(ch) + escaped = False + continue + if ch == "\\": + escaped = True + continue + if ch in meta: + break + out.append(ch) + return "".join(out) + + +@dataclass(frozen=True) +class CompiledPathPattern: + raw: str + kind: str # 'prefix' | 'glob' | 'regex' + value: str + regex: Optional[re.Pattern[str]] = None + + def matches(self, path: str) -> bool: + p = _norm_abs(path) + + if self.kind == "regex": + if not self.regex: + return False + # Search (not match) so users can write unanchored patterns. + return self.regex.search(p) is not None + + if self.kind == "glob": + return _posix_match(p, self.value) + + # prefix + pref = self.value.rstrip("/") + return p == pref or p.startswith(pref + "/") + + +def compile_path_pattern(raw: str) -> CompiledPathPattern: + s = raw.strip() + for pre in _REGEX_PREFIXES: + if s.startswith(pre): + rex = s[len(pre) :].strip() + try: + return CompiledPathPattern( + raw=raw, kind="regex", value=rex, regex=re.compile(rex) + ) + except re.error: + # Treat invalid regexes as non-matching. 
+ return CompiledPathPattern(raw=raw, kind="regex", value=rex, regex=None) + + # If the user explicitly says glob:, honour it. + if s.startswith("glob:"): + pat = s[len("glob:") :].strip() + return CompiledPathPattern(raw=raw, kind="glob", value=_norm_abs(pat)) + + # Heuristic: if it contains glob metacharacters, treat as a glob. + if _has_glob_chars(s) or "**" in s: + return CompiledPathPattern(raw=raw, kind="glob", value=_norm_abs(s)) + + # Otherwise treat as an exact path-or-prefix (dir subtree). + return CompiledPathPattern(raw=raw, kind="prefix", value=_norm_abs(s)) + + +@dataclass +class PathFilter: + """User-provided path filters. + + Semantics: + - exclude patterns always win + - include patterns are used only to expand *additional* files to harvest + (they do not restrict the default harvest set) + + Patterns: + - By default: glob-like (supports '**') + - Regex: prefix with 're:' or 'regex:' + - Force glob: prefix with 'glob:' + - A plain path without wildcards matches that path and everything under it + (directory-prefix behavior). + + Examples: + --exclude-path /usr/local/bin/docker-* + --include-path /home/*/.bashrc + --include-path 're:^/home/[^/]+/.config/myapp/.*$' + """ + + include: Sequence[str] = () + exclude: Sequence[str] = () + + def __post_init__(self) -> None: + self._include = [ + compile_path_pattern(p) for p in self.include if str(p).strip() + ] + self._exclude = [ + compile_path_pattern(p) for p in self.exclude if str(p).strip() + ] + + def is_excluded(self, path: str) -> bool: + for pat in self._exclude: + if pat.matches(path): + return True + return False + + def iter_include_patterns(self) -> List[CompiledPathPattern]: + return list(self._include) + + +def expand_includes( + patterns: Sequence[CompiledPathPattern], + *, + exclude: Optional[PathFilter] = None, + max_files: int = 4000, +) -> Tuple[List[str], List[str]]: + """Expand include patterns into concrete file paths. + + Returns (paths, notes). 
The returned paths are absolute paths. + + This function is intentionally conservative: + - symlinks are ignored (both dirs and files) + - the number of collected files is capped + + Regex patterns are expanded by walking a best-effort inferred root. + """ + + out: List[str] = [] + notes: List[str] = [] + seen: Set[str] = set() + + def _maybe_add_file(p: str) -> None: + if len(out) >= max_files: + return + p = _norm_abs(p) + if exclude and exclude.is_excluded(p): + return + if p in seen: + return + if not os.path.isfile(p) or os.path.islink(p): + return + seen.add(p) + out.append(p) + + def _walk_dir(root: str, match: Optional[CompiledPathPattern] = None) -> None: + root = _norm_abs(root) + if not os.path.isdir(root) or os.path.islink(root): + return + for dirpath, dirnames, filenames in os.walk(root, followlinks=False): + # Prune excluded directories early. + if exclude: + dirnames[:] = [ + d + for d in dirnames + if not exclude.is_excluded(os.path.join(dirpath, d)) + and not os.path.islink(os.path.join(dirpath, d)) + ] + for fn in filenames: + if len(out) >= max_files: + return + p = os.path.join(dirpath, fn) + if os.path.islink(p) or not os.path.isfile(p): + continue + if exclude and exclude.is_excluded(p): + continue + if match is not None and not match.matches(p): + continue + if p in seen: + continue + seen.add(p) + out.append(_norm_abs(p)) + + for pat in patterns: + if len(out) >= max_files: + notes.append( + f"Include cap reached ({max_files}); some includes were not expanded." + ) + break + + matched_any = False + + if pat.kind == "prefix": + p = pat.value + if os.path.isfile(p) and not os.path.islink(p): + _maybe_add_file(p) + matched_any = True + elif os.path.isdir(p) and not os.path.islink(p): + before = len(out) + _walk_dir(p) + matched_any = len(out) > before + else: + # Still allow prefix patterns that don't exist now (e.g. remote different) + # by matching nothing rather than erroring. 
+ matched_any = False + + elif pat.kind == "glob": + # Use glob for expansion; also walk directories that match. + gpat = pat.value + hits = glob.glob(gpat, recursive=True) + for h in hits: + if len(out) >= max_files: + break + h = _norm_abs(h) + if exclude and exclude.is_excluded(h): + continue + if os.path.isdir(h) and not os.path.islink(h): + before = len(out) + _walk_dir(h) + if len(out) > before: + matched_any = True + elif os.path.isfile(h) and not os.path.islink(h): + _maybe_add_file(h) + matched_any = True + + else: # regex + rex = pat.value + prefix = _regex_literal_prefix(rex) + # Determine a walk root. If we can infer an absolute prefix, use its + # directory; otherwise fall back to '/'. + if prefix.startswith("/"): + root = os.path.dirname(prefix) or "/" + else: + root = "/" + before = len(out) + _walk_dir(root, match=pat) + matched_any = len(out) > before + + if not matched_any: + notes.append(f"Include pattern matched no files: {pat.raw!r}") + + return out, notes diff --git a/enroll/remote.py b/enroll/remote.py index 469248d..9618512 100644 --- a/enroll/remote.py +++ b/enroll/remote.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import shlex import shutil import tarfile import tempfile @@ -97,6 +98,8 @@ def remote_harvest( remote_python: str = "python3", dangerous: bool = False, no_sudo: bool = False, + include_paths: Optional[list[str]] = None, + exclude_paths: Optional[list[str]] = None, ) -> Path: """Run enroll harvest on a remote host via SSH and pull the bundle locally. @@ -165,13 +168,25 @@ def remote_harvest( sftp.put(str(pyz), rapp) # Run remote harvest. 
- _cmd = f"{remote_python} {rapp} harvest --out {rbundle}" + argv: list[str] = [ + remote_python, + rapp, + "harvest", + "--out", + rbundle, + ] + if dangerous: + argv.append("--dangerous") + for p in include_paths or []: + argv.extend(["--include-path", str(p)]) + for p in exclude_paths or []: + argv.extend(["--exclude-path", str(p)]) + + _cmd = " ".join(shlex.quote(a) for a in argv) if not no_sudo: cmd = f"sudo {_cmd}" else: cmd = _cmd - if dangerous: - cmd += " --dangerous" rc, out, err = _ssh_run(ssh, cmd) if rc != 0: raise RuntimeError( diff --git a/tests/test_cli.py b/tests/test_cli.py index ca3bfa6..4477b24 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,9 +6,17 @@ import enroll.cli as cli def test_cli_harvest_subcommand_calls_harvest(monkeypatch, capsys, tmp_path): called = {} - def fake_harvest(out: str, dangerous: bool = False): + def fake_harvest( + out: str, + dangerous: bool = False, + include_paths=None, + exclude_paths=None, + **_kwargs, + ): called["out"] = out called["dangerous"] = dangerous + called["include_paths"] = include_paths or [] + called["exclude_paths"] = exclude_paths or [] return str(tmp_path / "state.json") monkeypatch.setattr(cli, "harvest", fake_harvest) @@ -17,6 +25,8 @@ def test_cli_harvest_subcommand_calls_harvest(monkeypatch, capsys, tmp_path): cli.main() assert called["out"] == str(tmp_path) assert called["dangerous"] is False + assert called["include_paths"] == [] + assert called["exclude_paths"] == [] captured = capsys.readouterr() assert str(tmp_path / "state.json") in captured.out @@ -55,8 +65,16 @@ def test_cli_manifest_subcommand_calls_manifest(monkeypatch, tmp_path): def test_cli_enroll_subcommand_runs_harvest_then_manifest(monkeypatch, tmp_path): calls = [] - def fake_harvest(bundle_dir: str, dangerous: bool = False): - calls.append(("harvest", bundle_dir, dangerous)) + def fake_harvest( + bundle_dir: str, + dangerous: bool = False, + include_paths=None, + exclude_paths=None, + **_kwargs, + ): + 
calls.append( + ("harvest", bundle_dir, dangerous, include_paths or [], exclude_paths or []) + ) return str(tmp_path / "bundle" / "state.json") def fake_manifest(bundle_dir: str, out_dir: str, **kwargs): @@ -87,7 +105,7 @@ def test_cli_enroll_subcommand_runs_harvest_then_manifest(monkeypatch, tmp_path) cli.main() assert calls == [ - ("harvest", str(tmp_path / "bundle"), False), + ("harvest", str(tmp_path / "bundle"), False, [], []), ("manifest", str(tmp_path / "bundle"), str(tmp_path / "ansible"), None, "auto"), ] @@ -95,9 +113,17 @@ def test_cli_enroll_subcommand_runs_harvest_then_manifest(monkeypatch, tmp_path) def test_cli_harvest_dangerous_flag_is_forwarded(monkeypatch, tmp_path): called = {} - def fake_harvest(out: str, dangerous: bool = False): + def fake_harvest( + out: str, + dangerous: bool = False, + include_paths=None, + exclude_paths=None, + **_kwargs, + ): called["out"] = out called["dangerous"] = dangerous + called["include_paths"] = include_paths or [] + called["exclude_paths"] = exclude_paths or [] return str(tmp_path / "state.json") monkeypatch.setattr(cli, "harvest", fake_harvest) @@ -107,6 +133,8 @@ def test_cli_harvest_dangerous_flag_is_forwarded(monkeypatch, tmp_path): cli.main() assert called["dangerous"] is True + assert called["include_paths"] == [] + assert called["exclude_paths"] == [] def test_cli_harvest_remote_calls_remote_harvest_and_uses_cache_dir( @@ -131,6 +159,9 @@ def test_cli_harvest_remote_calls_remote_harvest_and_uses_cache_dir( remote_user, dangerous, no_sudo, + include_paths=None, + exclude_paths=None, + **_kwargs, ): called.update( { @@ -140,6 +171,8 @@ def test_cli_harvest_remote_calls_remote_harvest_and_uses_cache_dir( "remote_user": remote_user, "dangerous": dangerous, "no_sudo": no_sudo, + "include_paths": include_paths or [], + "exclude_paths": exclude_paths or [], } ) return cache_dir / "state.json" @@ -169,6 +202,8 @@ def test_cli_harvest_remote_calls_remote_harvest_and_uses_cache_dir( assert called["remote_user"] == 
"alice" assert called["dangerous"] is False assert called["no_sudo"] is False + assert called["include_paths"] == [] + assert called["exclude_paths"] == [] def test_cli_single_shot_remote_without_harvest_prints_state_path(