Allow the user to add extra paths to harvest, or
All checks were successful
CI / test (push) Successful in 5m31s
Lint / test (push) Successful in 34s
Trivy / test (push) Successful in 19s

paths to ignore, using `--exclude-path` and
`--include-path` arguments.
This commit is contained in:
Miguel Jacq 2025-12-20 17:47:00 +11:00
parent 25add369dc
commit 240e79706f
Signed by: mig5
GPG key ID: 59B3F0C24135C6A9
9 changed files with 687 additions and 12 deletions

View file

@ -125,6 +125,27 @@ def main() -> None:
action="store_true",
help="Collect files more aggressively (may include secrets). Disables secret-avoidance checks.",
)
h.add_argument(
"--include-path",
action="append",
default=[],
metavar="PATTERN",
help=(
"Include extra file paths to harvest (repeatable). Supports globs (including '**') and regex via 're:<regex>'. "
"Included files are still filtered by IgnorePolicy unless --dangerous is used."
),
)
h.add_argument(
"--exclude-path",
action="append",
default=[],
metavar="PATTERN",
help=(
"Exclude file paths from harvesting (repeatable). Supports globs (including '**') and regex via 're:<regex>'. "
"Excludes apply to all harvesting, including defaults."
),
)
h.add_argument(
"--sops",
nargs="+",
@ -186,6 +207,27 @@ def main() -> None:
action="store_true",
help="Collect files more aggressively (may include secrets). Disables secret-avoidance checks.",
)
s.add_argument(
"--include-path",
action="append",
default=[],
metavar="PATTERN",
help=(
"Include extra file paths to harvest (repeatable). Supports globs (including '**') and regex via 're:<regex>'. "
"Included files are still filtered by IgnorePolicy unless --dangerous is used."
),
)
s.add_argument(
"--exclude-path",
action="append",
default=[],
metavar="PATTERN",
help=(
"Exclude file paths from harvesting (repeatable). Supports globs (including '**') and regex via 're:<regex>'. "
"Excludes apply to all harvesting, including defaults."
),
)
s.add_argument(
"--sops",
nargs="+",
@ -320,6 +362,8 @@ def main() -> None:
remote_user=args.remote_user,
dangerous=bool(args.dangerous),
no_sudo=bool(args.no_sudo),
include_paths=list(getattr(args, "include_path", []) or []),
exclude_paths=list(getattr(args, "exclude_path", []) or []),
)
_encrypt_harvest_dir_to_sops(
tmp_bundle, out_file, list(sops_fps)
@ -338,6 +382,8 @@ def main() -> None:
remote_user=args.remote_user,
dangerous=bool(args.dangerous),
no_sudo=bool(args.no_sudo),
include_paths=list(getattr(args, "include_path", []) or []),
exclude_paths=list(getattr(args, "exclude_path", []) or []),
)
print(str(state))
else:
@ -350,7 +396,12 @@ def main() -> None:
os.chmod(tmp_bundle, 0o700)
except OSError:
pass
harvest(str(tmp_bundle), dangerous=bool(args.dangerous))
harvest(
str(tmp_bundle),
dangerous=bool(args.dangerous),
include_paths=list(getattr(args, "include_path", []) or []),
exclude_paths=list(getattr(args, "exclude_path", []) or []),
)
_encrypt_harvest_dir_to_sops(
tmp_bundle, out_file, list(sops_fps)
)
@ -360,7 +411,12 @@ def main() -> None:
raise SystemExit(
"error: --out is required unless --remote-host is set"
)
path = harvest(args.out, dangerous=bool(args.dangerous))
path = harvest(
args.out,
dangerous=bool(args.dangerous),
include_paths=list(getattr(args, "include_path", []) or []),
exclude_paths=list(getattr(args, "exclude_path", []) or []),
)
print(path)
elif args.cmd == "manifest":
out_enc = manifest(
@ -446,6 +502,8 @@ def main() -> None:
remote_user=args.remote_user,
dangerous=bool(args.dangerous),
no_sudo=bool(args.no_sudo),
include_paths=list(getattr(args, "include_path", []) or []),
exclude_paths=list(getattr(args, "exclude_path", []) or []),
)
_encrypt_harvest_dir_to_sops(
tmp_bundle, out_file, list(sops_fps)
@ -473,6 +531,8 @@ def main() -> None:
remote_user=args.remote_user,
dangerous=bool(args.dangerous),
no_sudo=bool(args.no_sudo),
include_paths=list(getattr(args, "include_path", []) or []),
exclude_paths=list(getattr(args, "exclude_path", []) or []),
)
manifest(
str(harvest_dir),
@ -493,7 +553,12 @@ def main() -> None:
os.chmod(tmp_bundle, 0o700)
except OSError:
pass
harvest(str(tmp_bundle), dangerous=bool(args.dangerous))
harvest(
str(tmp_bundle),
dangerous=bool(args.dangerous),
include_paths=list(getattr(args, "include_path", []) or []),
exclude_paths=list(getattr(args, "exclude_path", []) or []),
)
_encrypt_harvest_dir_to_sops(
tmp_bundle, out_file, list(sops_fps)
)
@ -512,7 +577,12 @@ def main() -> None:
raise SystemExit(
"error: --harvest is required unless --remote-host is set"
)
harvest(args.harvest, dangerous=bool(args.dangerous))
harvest(
args.harvest,
dangerous=bool(args.dangerous),
include_paths=list(getattr(args, "include_path", []) or []),
exclude_paths=list(getattr(args, "exclude_path", []) or []),
)
manifest(
args.harvest,
args.out,

View file

@ -196,6 +196,12 @@ def _iter_managed_files(state: Dict[str, Any]) -> Iterable[Tuple[str, Dict[str,
for mf in ul.get("managed_files", []) or []:
yield str(ul_role), mf
# extra_paths
xp = state.get("extra_paths") or {}
xp_role = xp.get("role_name") or "extra_paths"
for mf in xp.get("managed_files", []) or []:
yield str(xp_role), mf
def _file_index(bundle_dir: Path, state: Dict[str, Any]) -> Dict[str, FileRec]:
"""Return mapping of absolute path -> FileRec.

View file

@ -19,6 +19,7 @@ from .debian import (
stat_triplet,
)
from .ignore import IgnorePolicy
from .pathfilter import PathFilter, expand_includes
from .accounts import collect_non_system_users
@ -86,6 +87,16 @@ class UsrLocalCustomSnapshot:
notes: List[str]
@dataclass
class ExtraPathsSnapshot:
    """Harvest results for user-requested include paths (--include-path).

    Serialised into state.json under the "extra_paths" key and later
    manifested as its own role.
    """

    # Role name used for the generated role (defaults to "extra_paths").
    role_name: str
    # Raw --include-path patterns exactly as supplied by the user.
    include_patterns: List[str]
    # Raw --exclude-path patterns exactly as supplied by the user.
    exclude_patterns: List[str]
    # Files actually copied into the bundle for this role.
    managed_files: List[ManagedFile]
    # Files that matched an include but were skipped, with the reason
    # (e.g. "user_excluded", policy deny reason, "unreadable").
    excluded: List[ExcludedFile]
    # Free-form human-readable notes recorded during harvesting.
    notes: List[str]
ALLOWED_UNOWNED_EXTS = {
".conf",
".cfg",
@ -250,6 +261,8 @@ def harvest(
policy: Optional[IgnorePolicy] = None,
*,
dangerous: bool = False,
include_paths: Optional[List[str]] = None,
exclude_paths: Optional[List[str]] = None,
) -> str:
# If a policy is not supplied, build one. `--dangerous` relaxes secret
# detection and deny-glob skipping.
@ -261,6 +274,10 @@ def harvest(
policy.dangerous = True
os.makedirs(bundle_dir, exist_ok=True)
# User-provided includes/excludes. Excludes apply to all harvesting;
# includes are harvested into an extra role.
path_filter = PathFilter(include=include_paths or (), exclude=exclude_paths or ())
if hasattr(os, "geteuid") and os.geteuid() != 0:
print(
"Warning: not running as root; harvest may miss files or metadata.",
@ -406,6 +423,9 @@ def harvest(
)
for path, reason in sorted(candidates.items()):
if path_filter.is_excluded(path):
excluded.append(ExcludedFile(path=path, reason="user_excluded"))
continue
deny = policy.deny_reason(path)
if deny:
excluded.append(ExcludedFile(path=path, reason=deny))
@ -522,6 +542,9 @@ def harvest(
candidates.setdefault(r, "custom_specific_path")
for path, reason in sorted(candidates.items()):
if path_filter.is_excluded(path):
excluded.append(ExcludedFile(path=path, reason="user_excluded"))
continue
deny = policy.deny_reason(path)
if deny:
excluded.append(ExcludedFile(path=path, reason=deny))
@ -593,6 +616,9 @@ def harvest(
# Copy only safe SSH public material: authorized_keys + *.pub
for sf in u.ssh_files:
if path_filter.is_excluded(sf):
users_excluded.append(ExcludedFile(path=sf, reason="user_excluded"))
continue
deny = policy.deny_reason(sf)
if deny:
users_excluded.append(ExcludedFile(path=sf, reason=deny))
@ -665,6 +691,10 @@ def harvest(
if not _is_confish(path):
continue
if path_filter.is_excluded(path):
etc_excluded.append(ExcludedFile(path=path, reason="user_excluded"))
continue
deny = policy.deny_reason(path)
if deny:
etc_excluded.append(ExcludedFile(path=path, reason=deny))
@ -754,6 +784,10 @@ def harvest(
ul_excluded.append(ExcludedFile(path=path, reason="unreadable"))
continue
if path_filter.is_excluded(path):
ul_excluded.append(ExcludedFile(path=path, reason="user_excluded"))
continue
deny = policy.deny_reason(path)
if deny:
ul_excluded.append(ExcludedFile(path=path, reason=deny))
@ -806,6 +840,81 @@ def harvest(
notes=ul_notes,
)
# -------------------------
# extra_paths role (user-requested includes)
# -------------------------
extra_notes: List[str] = []
extra_excluded: List[ExcludedFile] = []
extra_managed: List[ManagedFile] = []
extra_role_name = "extra_paths"
include_specs = list(include_paths or [])
exclude_specs = list(exclude_paths or [])
if include_specs:
extra_notes.append("User include patterns:")
extra_notes.extend([f"- {p}" for p in include_specs])
if exclude_specs:
extra_notes.append("User exclude patterns:")
extra_notes.extend([f"- {p}" for p in exclude_specs])
included_files: List[str] = []
if include_specs:
files, inc_notes = expand_includes(
path_filter.iter_include_patterns(),
exclude=path_filter,
max_files=4000,
)
included_files = files
extra_notes.extend(inc_notes)
for path in included_files:
if path in already_all:
continue
if path_filter.is_excluded(path):
extra_excluded.append(ExcludedFile(path=path, reason="user_excluded"))
continue
deny = policy.deny_reason(path)
if deny:
extra_excluded.append(ExcludedFile(path=path, reason=deny))
continue
try:
owner, group, mode = stat_triplet(path)
except OSError:
extra_excluded.append(ExcludedFile(path=path, reason="unreadable"))
continue
src_rel = path.lstrip("/")
try:
_copy_into_bundle(bundle_dir, extra_role_name, path, src_rel)
except OSError:
extra_excluded.append(ExcludedFile(path=path, reason="unreadable"))
continue
extra_managed.append(
ManagedFile(
path=path,
src_rel=src_rel,
owner=owner,
group=group,
mode=mode,
reason="user_include",
)
)
already_all.add(path)
extra_paths_snapshot = ExtraPathsSnapshot(
role_name=extra_role_name,
include_patterns=include_specs,
exclude_patterns=exclude_specs,
managed_files=extra_managed,
excluded=extra_excluded,
notes=extra_notes,
)
state = {
"host": {"hostname": os.uname().nodename, "os": "debian"},
"users": asdict(users_snapshot),
@ -815,6 +924,7 @@ def harvest(
"package_roles": [asdict(p) for p in pkg_snaps],
"etc_custom": asdict(etc_custom_snapshot),
"usr_local_custom": asdict(usr_local_custom_snapshot),
"extra_paths": asdict(extra_paths_snapshot),
}
state_path = os.path.join(bundle_dir, "state.json")

View file

@ -630,6 +630,7 @@ def _manifest_from_bundle_dir(
users_snapshot: Dict[str, Any] = state.get("users", {})
etc_custom_snapshot: Dict[str, Any] = state.get("etc_custom", {})
usr_local_custom_snapshot: Dict[str, Any] = state.get("usr_local_custom", {})
extra_paths_snapshot: Dict[str, Any] = state.get("extra_paths", {})
site_mode = fqdn is not None and fqdn != ""
@ -663,6 +664,7 @@ def _manifest_from_bundle_dir(
manifested_users_roles: List[str] = []
manifested_etc_custom_roles: List[str] = []
manifested_usr_local_custom_roles: List[str] = []
manifested_extra_paths_roles: List[str] = []
manifested_service_roles: List[str] = []
manifested_pkg_roles: List[str] = []
@ -1098,6 +1100,118 @@ Unowned /etc config files not attributed to packages or services.
manifested_usr_local_custom_roles.append(role)
# -------------------------
# extra_paths role (user-requested includes)
# -------------------------
if extra_paths_snapshot and extra_paths_snapshot.get("managed_files"):
role = extra_paths_snapshot.get("role_name", "extra_paths")
role_dir = os.path.join(roles_root, role)
_write_role_scaffold(role_dir)
var_prefix = role
managed_files = extra_paths_snapshot.get("managed_files", [])
excluded = extra_paths_snapshot.get("excluded", [])
notes = extra_paths_snapshot.get("notes", [])
include_pats = extra_paths_snapshot.get("include_patterns", []) or []
exclude_pats = extra_paths_snapshot.get("exclude_patterns", []) or []
templated, jt_vars = _jinjify_managed_files(
bundle_dir,
role,
role_dir,
managed_files,
jt_exe=jt_exe,
jt_enabled=jt_enabled,
overwrite_templates=not site_mode,
)
if site_mode:
_copy_artifacts(
bundle_dir,
role,
_host_role_files_dir(out_dir, fqdn or "", role),
exclude_rels=templated,
)
else:
_copy_artifacts(
bundle_dir,
role,
os.path.join(role_dir, "files"),
exclude_rels=templated,
)
files_var = _build_managed_files_var(
managed_files,
templated,
notify_other=None,
notify_systemd=None,
)
jt_map = _yaml_load_mapping(jt_vars) if jt_vars.strip() else {}
vars_map: Dict[str, Any] = {f"{var_prefix}_managed_files": files_var}
vars_map = _merge_mappings_overwrite(vars_map, jt_map)
if site_mode:
_write_role_defaults(role_dir, {f"{var_prefix}_managed_files": []})
_write_hostvars(out_dir, fqdn or "", role, vars_map)
else:
_write_role_defaults(role_dir, vars_map)
tasks = "---\n" + _render_generic_files_tasks(
var_prefix, include_restart_notify=False
)
with open(
os.path.join(role_dir, "tasks", "main.yml"), "w", encoding="utf-8"
) as f:
f.write(tasks.rstrip() + "\n")
with open(
os.path.join(role_dir, "handlers", "main.yml"), "w", encoding="utf-8"
) as f:
f.write("---\n")
with open(
os.path.join(role_dir, "meta", "main.yml"), "w", encoding="utf-8"
) as f:
f.write("---\ndependencies: []\n")
readme = (
f"""# {role}
User-requested extra file harvesting.
## Include patterns
"""
+ ("\n".join([f"- {p}" for p in include_pats]) or "- (none)")
+ """\n
## Exclude patterns
"""
+ ("\n".join([f"- {p}" for p in exclude_pats]) or "- (none)")
+ """\n
## Managed files
"""
+ ("\n".join([f"- {mf.get('path')}" for mf in managed_files]) or "- (none)")
+ """\n
## Excluded
"""
+ (
"\n".join([f"- {e.get('path')} ({e.get('reason')})" for e in excluded])
or "- (none)"
)
+ """\n
## Notes
"""
+ ("\n".join([f"- {n}" for n in notes]) or "- (none)")
+ """\n"""
)
with open(os.path.join(role_dir, "README.md"), "w", encoding="utf-8") as f:
f.write(readme)
manifested_extra_paths_roles.append(role)
manifested_usr_local_custom_roles.append(role)
# -------------------------
# -------------------------
@ -1412,6 +1526,7 @@ Generated for package `{pkg}`.
+ manifested_service_roles
+ manifested_etc_custom_roles
+ manifested_usr_local_custom_roles
+ manifested_extra_paths_roles
+ manifested_users_roles
)

293
enroll/pathfilter.py Normal file
View file

@ -0,0 +1,293 @@
from __future__ import annotations
import glob
import os
import re
from dataclasses import dataclass
from pathlib import PurePosixPath
from typing import List, Optional, Sequence, Set, Tuple
_REGEX_PREFIXES = ("re:", "regex:")
def _has_glob_chars(s: str) -> bool:
    """Return True when *s* contains any glob metacharacter (*, ? or [)."""
    for meta in ("*", "?", "["):
        if meta in s:
            return True
    return False
def _norm_abs(p: str) -> str:
    """Normalise a path-ish string to an absolute POSIX path.

    Inputs that don't start with '/' are treated as relative to '/'.
    An empty (or whitespace-only) input normalises to '/'.
    """
    p = p.strip()
    if not p:
        return "/"
    if not p.startswith("/"):
        p = "/" + p
    # `normpath` keeps a leading '/' for absolute paths, but per POSIX it
    # also preserves *exactly two* leading slashes ('//etc' stays '//etc'),
    # which would defeat prefix comparison against '/etc'.  Collapse that
    # case down to a single leading '/'.  (Three or more leading slashes
    # are already collapsed by normpath itself.)
    p = os.path.normpath(p)
    if p.startswith("//"):
        p = p[1:]
    return p
def _translate_segment(seg: str) -> str:
    """Translate one glob path segment (contains no '/') to a regex fragment.

    '*' and '?' never cross a '/' boundary; '[...]' character classes are
    passed through, with '[!...]' negation rewritten to '[^...]'.  An
    unterminated '[' is treated as a literal character.
    """
    out: List[str] = []
    i = 0
    while i < len(seg):
        ch = seg[i]
        if ch == "*":
            out.append("[^/]*")
        elif ch == "?":
            out.append("[^/]")
        elif ch == "[":
            # Scan for the closing ']' using fnmatch-style rules: a leading
            # '!'/'^' negates, and a ']' immediately after that is literal.
            j = i + 1
            if j < len(seg) and seg[j] in "!^":
                j += 1
            if j < len(seg) and seg[j] == "]":
                j += 1
            while j < len(seg) and seg[j] != "]":
                j += 1
            if j >= len(seg):
                out.append(re.escape("["))
            else:
                inner = seg[i + 1 : j]
                if inner.startswith("!"):
                    inner = "^" + inner[1:]
                out.append("[" + inner + "]")
                i = j
        else:
            out.append(re.escape(ch))
        i += 1
    return "".join(out)


def _glob_to_regex(pattern: str) -> str:
    """Translate a '/'-separated glob (with '**' support) to an anchored regex.

    A '**' segment matches zero or more whole path segments; a trailing '**'
    also matches the path itself (so '/etc/**' matches '/etc').
    """
    parts = pattern.split("/")
    regex = ""
    sep = ""  # separator to emit before the next literal segment
    for i, part in enumerate(parts):
        last = i == len(parts) - 1
        if part == "**":
            if last:
                # Trailing '**': the path itself, or anything below it.
                regex += "(?:/.*)?" if regex else ".*"
            else:
                # Zero or more whole segments, each consuming its own '/'.
                regex += sep + "(?:[^/]+/)*"
                sep = ""
            continue
        regex += sep + _translate_segment(part)
        sep = "/"
    return regex + r"\Z"


def _posix_match(path: str, pattern: str) -> bool:
    """Anchored glob matching over POSIX paths.

    '/' is a segment separator: '*' and '?' never cross it, while '**'
    matches any number of whole segments.  Both `path` and `pattern` are
    treated as absolute paths.

    NOTE: this deliberately avoids `PurePosixPath.match`, which is
    right-anchored (so 'b.py' matches '/a/b.py') and, before Python 3.13,
    treats '**' like a plain single-segment '*' — i.e. the recursive
    matching this module advertises would silently not work.
    """
    p = path.lstrip("/")
    pat = pattern.lstrip("/")
    try:
        return re.match(_glob_to_regex(pat), p) is not None
    except re.error:
        # If the pattern is somehow invalid, fail closed.
        return False
def _regex_literal_prefix(regex: str) -> str:
"""Best-effort literal prefix extraction for a regex.
This lets us pick a starting directory to walk when expanding regex-based
include patterns.
"""
s = regex
if s.startswith("^"):
s = s[1:]
out: List[str] = []
escaped = False
meta = set(".^$*+?{}[]\\|()")
for ch in s:
if escaped:
out.append(ch)
escaped = False
continue
if ch == "\\":
escaped = True
continue
if ch in meta:
break
out.append(ch)
return "".join(out)
@dataclass(frozen=True)
class CompiledPathPattern:
    """A single parsed include/exclude pattern.

    `kind` is one of 'prefix' | 'glob' | 'regex'; `value` holds the
    normalised pattern text and `regex` the compiled expression when
    kind == 'regex' (None when the user's regex failed to compile).
    """

    raw: str
    kind: str  # 'prefix' | 'glob' | 'regex'
    value: str
    regex: Optional[re.Pattern[str]] = None

    def matches(self, path: str) -> bool:
        """Return True if `path` (normalised to absolute) matches."""
        candidate = _norm_abs(path)
        if self.kind == "glob":
            return _posix_match(candidate, self.value)
        if self.kind == "regex":
            # `search` (not `match`) so users may write unanchored patterns;
            # an uncompilable regex never matches.
            if self.regex is None:
                return False
            return self.regex.search(candidate) is not None
        # 'prefix': exact path, or anything under it as a directory subtree.
        base = self.value.rstrip("/")
        return candidate == base or candidate.startswith(base + "/")
def compile_path_pattern(raw: str) -> CompiledPathPattern:
    """Parse one user-supplied pattern string into a CompiledPathPattern.

    Recognised forms, checked in order:
      - 're:<regex>' / 'regex:<regex>'  -> regex (an invalid regex is kept
        but compiles to None and will never match)
      - 'glob:<pattern>'                -> forced glob interpretation
      - anything containing * ? or [    -> glob (heuristic)
      - anything else                   -> exact-path-or-subtree prefix
    """
    text = raw.strip()
    for marker in _REGEX_PREFIXES:
        if not text.startswith(marker):
            continue
        body = text[len(marker):].strip()
        try:
            compiled: Optional[re.Pattern[str]] = re.compile(body)
        except re.error:
            # Treat invalid regexes as non-matching.
            compiled = None
        return CompiledPathPattern(raw=raw, kind="regex", value=body, regex=compiled)
    if text.startswith("glob:"):
        # The user explicitly asked for glob semantics; honour it.
        forced = text[len("glob:"):].strip()
        return CompiledPathPattern(raw=raw, kind="glob", value=_norm_abs(forced))
    if _has_glob_chars(text):
        # Heuristic: glob metacharacters present, treat as a glob.
        return CompiledPathPattern(raw=raw, kind="glob", value=_norm_abs(text))
    # Plain path: matches the path itself and its whole subtree.
    return CompiledPathPattern(raw=raw, kind="prefix", value=_norm_abs(text))
@dataclass
class PathFilter:
    """User-provided path filters.

    Semantics:
      - exclude patterns always win
      - include patterns only expand *additional* files to harvest; they
        never restrict the default harvest set

    Pattern syntax:
      - default: glob-like (supports '**')
      - 're:' / 'regex:' prefix: regular expression
      - 'glob:' prefix: force glob interpretation
      - a plain path without wildcards matches that path and everything
        under it (directory-prefix behaviour)

    Examples:
      --exclude-path /usr/local/bin/docker-*
      --include-path /home/*/.bashrc
      --include-path 're:^/home/[^/]+/.config/myapp/.*$'
    """

    include: Sequence[str] = ()
    exclude: Sequence[str] = ()

    def __post_init__(self) -> None:
        # Compile every pattern once up front; blank entries are dropped.
        self._include = [
            compile_path_pattern(raw) for raw in self.include if str(raw).strip()
        ]
        self._exclude = [
            compile_path_pattern(raw) for raw in self.exclude if str(raw).strip()
        ]

    def is_excluded(self, path: str) -> bool:
        """Return True when any exclude pattern matches `path`."""
        return any(pattern.matches(path) for pattern in self._exclude)

    def iter_include_patterns(self) -> List[CompiledPathPattern]:
        """Return the compiled include patterns as a fresh list."""
        return list(self._include)
def expand_includes(
    patterns: Sequence[CompiledPathPattern],
    *,
    exclude: Optional[PathFilter] = None,
    max_files: int = 4000,
) -> Tuple[List[str], List[str]]:
    """Expand include patterns into concrete file paths.

    Returns (paths, notes). The returned paths are absolute paths; `notes`
    collects human-readable diagnostics (patterns that matched nothing,
    the cap being reached).

    This function is intentionally conservative:
    - symlinks are ignored (both dirs and files)
    - the number of collected files is capped at `max_files`
    - `exclude` patterns are honoured both per-file and as a directory
      prune during walks

    Regex patterns are expanded by walking a best-effort inferred root
    (derived from the regex's literal prefix, else '/').
    """
    out: List[str] = []
    notes: List[str] = []
    # Dedup guard shared by both helpers so a file counted once is never
    # re-added by a later pattern.
    seen: Set[str] = set()

    def _maybe_add_file(p: str) -> None:
        # Append a single regular, non-symlink file unless capped, excluded,
        # or already collected.
        if len(out) >= max_files:
            return
        p = _norm_abs(p)
        if exclude and exclude.is_excluded(p):
            return
        if p in seen:
            return
        if not os.path.isfile(p) or os.path.islink(p):
            return
        seen.add(p)
        out.append(p)

    def _walk_dir(root: str, match: Optional[CompiledPathPattern] = None) -> None:
        # Recursively collect regular files under `root` (symlinks skipped),
        # optionally keeping only paths that satisfy `match`.
        root = _norm_abs(root)
        if not os.path.isdir(root) or os.path.islink(root):
            return
        for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
            # Prune excluded directories early.
            if exclude:
                dirnames[:] = [
                    d
                    for d in dirnames
                    if not exclude.is_excluded(os.path.join(dirpath, d))
                    and not os.path.islink(os.path.join(dirpath, d))
                ]
            for fn in filenames:
                # Cap check: returning aborts the whole walk, not just
                # this directory.
                if len(out) >= max_files:
                    return
                p = os.path.join(dirpath, fn)
                if os.path.islink(p) or not os.path.isfile(p):
                    continue
                if exclude and exclude.is_excluded(p):
                    continue
                if match is not None and not match.matches(p):
                    continue
                if p in seen:
                    continue
                seen.add(p)
                out.append(_norm_abs(p))

    for pat in patterns:
        if len(out) >= max_files:
            notes.append(
                f"Include cap reached ({max_files}); some includes were not expanded."
            )
            break
        # Track whether this pattern produced anything, so we can report
        # no-op patterns back to the user.
        matched_any = False
        if pat.kind == "prefix":
            p = pat.value
            if os.path.isfile(p) and not os.path.islink(p):
                _maybe_add_file(p)
                matched_any = True
            elif os.path.isdir(p) and not os.path.islink(p):
                before = len(out)
                _walk_dir(p)
                matched_any = len(out) > before
            else:
                # Still allow prefix patterns that don't exist now (e.g. remote different)
                # by matching nothing rather than erroring.
                matched_any = False
        elif pat.kind == "glob":
            # Use glob for expansion; also walk directories that match.
            gpat = pat.value
            hits = glob.glob(gpat, recursive=True)
            for h in hits:
                if len(out) >= max_files:
                    break
                h = _norm_abs(h)
                if exclude and exclude.is_excluded(h):
                    continue
                if os.path.isdir(h) and not os.path.islink(h):
                    # A glob hit may be a directory: harvest its subtree.
                    before = len(out)
                    _walk_dir(h)
                    if len(out) > before:
                        matched_any = True
                elif os.path.isfile(h) and not os.path.islink(h):
                    _maybe_add_file(h)
                    matched_any = True
        else:  # regex
            rex = pat.value
            prefix = _regex_literal_prefix(rex)
            # Determine a walk root. If we can infer an absolute prefix, use its
            # directory; otherwise fall back to '/'.
            # NOTE(review): an unanchored regex therefore walks the whole
            # filesystem — bounded only by max_files.
            if prefix.startswith("/"):
                root = os.path.dirname(prefix) or "/"
            else:
                root = "/"
            before = len(out)
            _walk_dir(root, match=pat)
            matched_any = len(out) > before
        if not matched_any:
            notes.append(f"Include pattern matched no files: {pat.raw!r}")
    return out, notes

View file

@ -1,6 +1,7 @@
from __future__ import annotations
import os
import shlex
import shutil
import tarfile
import tempfile
@ -97,6 +98,8 @@ def remote_harvest(
remote_python: str = "python3",
dangerous: bool = False,
no_sudo: bool = False,
include_paths: Optional[list[str]] = None,
exclude_paths: Optional[list[str]] = None,
) -> Path:
"""Run enroll harvest on a remote host via SSH and pull the bundle locally.
@ -165,13 +168,25 @@ def remote_harvest(
sftp.put(str(pyz), rapp)
# Run remote harvest.
_cmd = f"{remote_python} {rapp} harvest --out {rbundle}"
argv: list[str] = [
remote_python,
rapp,
"harvest",
"--out",
rbundle,
]
if dangerous:
argv.append("--dangerous")
for p in include_paths or []:
argv.extend(["--include-path", str(p)])
for p in exclude_paths or []:
argv.extend(["--exclude-path", str(p)])
_cmd = " ".join(shlex.quote(a) for a in argv)
if not no_sudo:
cmd = f"sudo {_cmd}"
else:
cmd = _cmd
if dangerous:
cmd += " --dangerous"
rc, out, err = _ssh_run(ssh, cmd)
if rc != 0:
raise RuntimeError(