enroll/enroll/ignore.py
Miguel Jacq 4660a0703e
All checks were successful
CI / test (push) Successful in 5m43s
Lint / test (push) Successful in 30s
Trivy / test (push) Successful in 19s
Include files from /usr/local/bin and /usr/local/etc in harvest (assuming they aren't binaries or symlinks) and store in usr_local_custom role, similar to etc_custom.
2025-12-18 17:11:04 +11:00

116 lines
3.1 KiB
Python

from __future__ import annotations
import fnmatch
import os
import re
from dataclasses import dataclass
from typing import Optional
# Path globs that are refused by default. They are matched against the full
# path with fnmatch, so "*" is not stopped by "/" and a pattern such as
# "/etc/*shadow" also covers files in sub-directories of /etc.
DEFAULT_DENY_GLOBS = [
    # Common backup copies created by passwd tools (can contain sensitive data)
    "/etc/passwd-",
    "/etc/group-",
    "/etc/shadow-",
    "/etc/gshadow-",
    "/etc/subuid-",
    "/etc/subgid-",
    "/etc/*shadow-",
    "/etc/*gshadow-",
    # Private keys and SSH host keys under /etc
    "/etc/ssl/private/*",
    "/etc/ssh/ssh_host_*",
    # Password hash databases
    "/etc/shadow",
    "/etc/gshadow",
    "/etc/*shadow",
    # Let's Encrypt tree
    "/etc/letsencrypt/*",
    # The same kinds of locations mirrored under /usr/local/etc
    "/usr/local/etc/ssl/private/*",
    "/usr/local/etc/ssh/ssh_host_*",
    "/usr/local/etc/*shadow",
    "/usr/local/etc/*gshadow",
    "/usr/local/etc/letsencrypt/*",
]
# Byte-level regexes; a file is treated as sensitive when any of them matches
# a line of its content.
SENSITIVE_CONTENT_PATTERNS = [
    # PEM / ASCII-armor private key header. The label before "PRIVATE KEY" is
    # any run of upper-case words, so besides the bare, RSA, EC and OPENSSH
    # forms this also catches DSA, ENCRYPTED and "PGP PRIVATE KEY BLOCK".
    re.compile(rb"-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY(?: BLOCK)?-----"),
    # Explicit "password =" style assignments (case-insensitive).
    re.compile(rb"(?i)\bpassword\s*="),
    # Stand-alone secret-ish words (case-insensitive, whole word only).
    re.compile(rb"(?i)\b(pass|passwd|token|secret|api[_-]?key)\b"),
]
# Markers that introduce a single-line comment when they start a line
# (after leading whitespace has been removed).
COMMENT_PREFIXES = (b"#", b";", b"//")
# Delimiters of a C-style block comment.
BLOCK_START = b"/*"
BLOCK_END = b"*/"


@dataclass
class IgnorePolicy:
    """Policy deciding whether a file path may be collected.

    ``deny_reason`` is the entry point: it returns a short reason code when
    the file must be skipped and ``None`` when it is acceptable.
    """

    # Glob patterns that are refused outright; None selects
    # DEFAULT_DENY_GLOBS (copied in __post_init__).
    deny_globs: Optional[list[str]] = None
    # Files whose stat size exceeds this many bytes are rejected.
    max_file_bytes: int = 256_000
    # Upper bound on how many leading bytes are read for content checks.
    sample_bytes: int = 64_000
    # If True, be much less conservative about collecting potentially
    # sensitive files. This disables deny globs (e.g. /etc/shadow,
    # /etc/ssl/private/*) and skips heuristic content scanning.
    dangerous: bool = False

    def __post_init__(self) -> None:
        """Fill in the default deny list when none was supplied."""
        if self.deny_globs is None:
            # Copy so later mutation of this instance's list cannot alter
            # the module-level default.
            self.deny_globs = list(DEFAULT_DENY_GLOBS)

    def iter_effective_lines(self, content: bytes):
        """Yield the lines of *content* that are not blank or comments.

        Skipped lines are: empty/whitespace-only lines, lines starting with
        one of COMMENT_PREFIXES, lines starting with ``*`` (continuation
        style of block comments), and everything from a line that starts
        with ``/*`` up to and including the line containing the matching
        ``*/``. Yielded lines are the original bytes, leading whitespace
        included.
        """
        in_block = False
        for raw in content.splitlines():
            line = raw.lstrip()
            if in_block:
                # The closing line is itself part of the comment.
                if BLOCK_END in line:
                    in_block = False
                continue
            if not line:
                continue
            if line.startswith(BLOCK_START):
                # Only stay in block mode when the comment is not closed on
                # this same line; otherwise "/* note */" would hide every
                # following line until some later "*/". The opener is
                # sliced off first so that "/*/" does not count as closed.
                in_block = BLOCK_END not in line[len(BLOCK_START):]
                continue
            if line.startswith(COMMENT_PREFIXES) or line.startswith(b"*"):
                continue
            yield raw

    def deny_reason(self, path: str) -> Optional[str]:
        """Return why *path* must be skipped, or None if it is acceptable.

        Checks run in this order and the first hit wins:

        * ``"log_file"``         - path ends with ``.log``
        * ``"denied_path"``      - path matches a deny glob (not in dangerous mode)
        * ``"unreadable"``       - stat or open/read raised OSError
        * ``"too_large"``        - size exceeds ``max_file_bytes``
        * ``"not_regular_file"`` - not a regular file, or a symlink
        * ``"binary_like"``      - a NUL byte occurs in the sampled bytes
        * ``"sensitive_content"``- a non-comment line matches one of
          SENSITIVE_CONTENT_PATTERNS (not in dangerous mode)
        """
        # Always ignore plain *.log files (rarely useful as config, often noisy).
        if path.endswith(".log"):
            return "log_file"

        if not self.dangerous and any(
            fnmatch.fnmatch(path, glob) for glob in self.deny_globs or []
        ):
            return "denied_path"

        try:
            st = os.stat(path, follow_symlinks=True)
        except OSError:
            return "unreadable"

        if st.st_size > self.max_file_bytes:
            return "too_large"

        # The stat above follows symlinks, so links are rejected explicitly.
        if not os.path.isfile(path) or os.path.islink(path):
            return "not_regular_file"

        try:
            with open(path, "rb") as f:
                data = f.read(min(self.sample_bytes, st.st_size))
        except OSError:
            return "unreadable"

        if b"\x00" in data:
            return "binary_like"

        if not self.dangerous:
            # Comments and blank lines are excluded from the heuristic scan.
            for line in self.iter_effective_lines(data):
                if any(pat.search(line) for pat in SENSITIVE_CONTENT_PATTERNS):
                    return "sensitive_content"

        return None