def _files_differ(a: str, b: str, *, max_bytes: int = 2_000_000) -> bool:
    """Report whether file `a` differs in content from baseline file `b`.

    The check is deliberately conservative: every situation in which the
    two files cannot be proven identical yields True.

    Args:
        a: Path of the candidate file.
        b: Path of the baseline file to compare against.
        max_bytes: Size limit (in bytes) above which no content comparison
            is attempted and the files are reported as different.

    Returns:
        False only when both paths resolve (symlinks followed) to regular
        files of equal size, that size does not exceed `max_bytes`, and
        their bytes are identical. True in every other case, including
        when either path cannot be stat'ed or read.
    """
    chunk_size = 64 * 1024

    # Stat `a` first, then `b`; bail out as "different" as soon as one of
    # them is missing, unreadable, or not a regular file.
    sizes = []
    for path in (a, b):
        try:
            info = os.stat(path, follow_symlinks=True)
        except OSError:
            return True
        if not stat.S_ISREG(info.st_mode):
            return True
        sizes.append(info.st_size)

    size_a, size_b = sizes
    # A size mismatch settles it without reading; oversized files are
    # reported as different rather than paying for a full read.
    if size_a != size_b or size_a > max_bytes:
        return True

    try:
        with open(a, "rb") as left, open(b, "rb") as right:
            chunk_l = left.read(chunk_size)
            chunk_r = right.read(chunk_size)
            while chunk_l == chunk_r:
                if not chunk_l:
                    # Both streams hit EOF together with no mismatch.
                    return False
                chunk_l = left.read(chunk_size)
                chunk_r = right.read(chunk_size)
            return True
    except OSError:
        return True
+ home = (u.home or "").rstrip("/") + if home and home.startswith("/"): + for rel, reason in skel_dotfiles: + upath = os.path.join(home, rel) + if not os.path.exists(upath): + continue + skel_path = os.path.join(skel_dir, rel) + if not _files_differ(upath, skel_path, max_bytes=policy.max_file_bytes): + continue + _capture_file( + bundle_dir=bundle_dir, + role_name=users_role_name, + abs_path=upath, + reason=reason, + policy=policy, + path_filter=path_filter, + managed_out=users_managed, + excluded_out=users_excluded, + seen_role=users_role_seen, + seen_global=captured_global, + ) + + # Capture other common per-user shell files unconditionally if present. + for rel, reason in extra_dotfiles: + upath = os.path.join(home, rel) + if not os.path.exists(upath): + continue + _capture_file( + bundle_dir=bundle_dir, + role_name=users_role_name, + abs_path=upath, + reason=reason, + policy=policy, + path_filter=path_filter, + managed_out=users_managed, + excluded_out=users_excluded, + seen_role=users_role_seen, + seen_global=captured_global, + ) + users_snapshot = UsersSnapshot( role_name=users_role_name, users=users_list, diff --git a/enroll/manifest.py b/enroll/manifest.py index ea38e98..b616fe6 100644 --- a/enroll/manifest.py +++ b/enroll/manifest.py @@ -819,7 +819,12 @@ def _manifest_from_bundle_dir( group = str(u.get("primary_group") or owner) break - mode = "0600" if mf.get("reason") == "authorized_keys" else "0644" + # Prefer the harvested file mode so we preserve any deliberate + # permissions (e.g. 0600 for certain dotfiles). For authorized_keys, + # enforce 0600 regardless. + mode = mf.get("mode") or "0644" + if mf.get("reason") == "authorized_keys": + mode = "0600" ssh_files.append( { "dest": dest,