diff --git a/README.md b/README.md index b711e9f..c5702f3 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,14 @@ stdout. However, it is possible to output the results to new files. ## What sort of config files can it handle? -TOML, YAML, INI and JSON style config files should be okay. There are always +TOML, YAML, INI, JSON and XML-style config files should be okay. There are always going to be some edge cases in very complex files that are difficult to work with, though, so you may still find that you need to tweak the results. +The tool does not do anything intelligent like detect common sections that +could practically be turned into 'for' loops in Jinja. You'd have to do those +sorts of optimisations yourself. + The goal here is really to *speed up* converting files into Ansible/Jinja2, but not necessarily to make it perfect. @@ -68,7 +72,7 @@ jinjaturtle php.ini \ ## Full usage info ``` -usage: jinjaturtle [-h] -r ROLE_NAME [-f {json,ini,toml,yaml}] [-d DEFAULTS_OUTPUT] [-t TEMPLATE_OUTPUT] config +usage: jinjaturtle [-h] -r ROLE_NAME [-f {json,ini,toml,yaml,xml}] [-d DEFAULTS_OUTPUT] [-t TEMPLATE_OUTPUT] config Convert a config file into an Ansible defaults file and Jinja2 template. 
diff --git a/src/jinjaturtle/cli.py b/src/jinjaturtle/cli.py index 9b13502..8158cf4 100644 --- a/src/jinjaturtle/cli.py +++ b/src/jinjaturtle/cli.py @@ -30,7 +30,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: ap.add_argument( "-f", "--format", - choices=["ini", "json", "toml", "yaml"], + choices=["ini", "json", "toml", "yaml", "xml"], help="Force config format instead of auto-detecting from filename.", ) ap.add_argument( diff --git a/src/jinjaturtle/core.py b/src/jinjaturtle/core.py index bc5f822..a4dce7e 100644 --- a/src/jinjaturtle/core.py +++ b/src/jinjaturtle/core.py @@ -2,9 +2,12 @@ from __future__ import annotations import configparser import json +import xml.etree.ElementTree as ET +import yaml + +from collections import Counter, defaultdict from pathlib import Path from typing import Any, Iterable -import yaml try: import tomllib # Python 3.11+ @@ -46,7 +49,7 @@ _TurtleDumper.add_representer(None, _fallback_str_representer) def detect_format(path: Path, explicit: str | None = None) -> str: """ - Determine config format (toml, yaml, json, ini-ish) from argument or filename. + Determine config format (toml, yaml, json, ini-ish, xml) from argument or filename. """ if explicit: return explicit @@ -60,6 +63,8 @@ def detect_format(path: Path, explicit: str | None = None) -> str: return "json" if suffix in {".ini", ".cfg", ".conf"} or name.endswith(".ini"): return "ini" + if suffix == ".xml": + return "xml" # Fallback: treat as INI-ish return "ini" @@ -96,9 +101,74 @@ def parse_config(path: Path, fmt: str | None = None) -> tuple[str, Any]: parser.read_file(f) return fmt, parser + if fmt == "xml": + text = path.read_text(encoding="utf-8") + parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=False)) + root = ET.fromstring(text, parser=parser) + return fmt, root + raise ValueError(f"Unsupported config format: {fmt}") +def _flatten_xml(root: ET.Element) -> list[tuple[tuple[str, ...], Any]]: + """ + Flatten an XML tree into (path, value) pairs. 
+ + Path conventions: + - Root element's children are treated as top-level (root tag is *not* included). + - Element text: + <foo>bar</foo> -> path ("foo",) value "bar" + <foo attr="x">bar</foo> -> path ("foo", "value") value "bar" + <foo><bar>baz</bar></foo> -> ("foo", "bar") / etc. + - Attributes: + <server host="localhost"> + -> path ("server", "@host") value "localhost" + - Repeated sibling elements: + <endpoint>/a</endpoint> + <endpoint>/b</endpoint> + -> ("endpoint", "0") "/a" + ("endpoint", "1") "/b" + """ + items: list[tuple[tuple[str, ...], Any]] = [] + + def walk(elem: ET.Element, path: tuple[str, ...]) -> None: + # Attributes + for attr_name, attr_val in elem.attrib.items(): + attr_path = path + (f"@{attr_name}",) + items.append((attr_path, attr_val)) + + # Children + children = [c for c in list(elem) if isinstance(c.tag, str)] + + # Text content + text = (elem.text or "").strip() + if text: + if not elem.attrib and not children: + # Simple <foo>bar</foo> + items.append((path, text)) + else: + # Text alongside attrs/children + items.append((path + ("value",), text)) + + # Repeated siblings get an index; singletons just use the tag + counts = Counter(child.tag for child in children) + index_counters: dict[str, int] = defaultdict(int) + + for child in children: + tag = child.tag + if counts[tag] > 1: + idx = index_counters[tag] + index_counters[tag] += 1 + child_path = path + (tag, str(idx)) + else: + child_path = path + (tag,) + walk(child, child_path) + + # Treat root as a container: its children are top-level + walk(root, ()) + return items + + def flatten_config(fmt: str, parsed: Any) -> list[tuple[tuple[str, ...], Any]]: """ Flatten parsed config into a list of (path_tuple, value). 
@@ -141,6 +211,12 @@ def flatten_config(fmt: str, parsed: Any) -> list[tuple[tuple[str, ...], Any]]: else: processed = raw items.append(((section, key), processed)) + + elif fmt == "xml": + if not isinstance(parsed, ET.Element): + raise TypeError("XML parser result must be an Element") + items = _flatten_xml(parsed) + else: # pragma: no cover raise ValueError(f"Unsupported format: {fmt}") @@ -677,6 +753,135 @@ def _generate_json_template(role_prefix: str, data: Any) -> str: return json.dumps(templated, indent=2, ensure_ascii=False) + "\n" +def _split_xml_prolog(text: str) -> tuple[str, str]: + """ + Split an XML document into (prolog, body), where prolog includes: + - XML declaration (<?xml ... ?>) + - top-level comments + - DOCTYPE + The body starts at the root element. + """ + i = 0 + n = len(text) + prolog_parts: list[str] = [] + + while i < n: + # Preserve leading whitespace + while i < n and text[i].isspace(): + prolog_parts.append(text[i]) + i += 1 + if i >= n: + break + + if text.startswith("<?", i): + end = text.find("?>", i + 2) + if end == -1: + break + prolog_parts.append(text[i : end + 2]) + i = end + 2 + continue + + if text.startswith("<!--", i): + end = text.find("-->", i + 4) + if end == -1: + break + prolog_parts.append(text[i : end + 3]) + i = end + 3 + continue + + if text.startswith("<!DOCTYPE", i): + end = text.find(">", i + 9) + if end == -1: + break + prolog_parts.append(text[i : end + 1]) + i = end + 1 + continue + + if text[i] == "<": + # Assume root element starts here + break + + # Unexpected content: stop treating as prolog + break + + return "".join(prolog_parts), text[i:] + + +def _apply_jinja_to_xml_tree(role_prefix: str, root: ET.Element) -> None: + """ + Mutate the XML tree in-place, replacing scalar values with Jinja + expressions based on the same paths used in _flatten_xml. 
+ """ + + def walk(elem: ET.Element, path: tuple[str, ...]) -> None: + # Attributes + for attr_name in list(elem.attrib.keys()): + attr_path = path + (f"@{attr_name}",) + var_name = make_var_name(role_prefix, attr_path) + elem.set(attr_name, f"{{{{ {var_name} }}}}") + + # Children + children = [c for c in list(elem) if isinstance(c.tag, str)] + + # Text content + text = (elem.text or "").strip() + if text: + if not elem.attrib and not children: + text_path = path + else: + text_path = path + ("value",) + var_name = make_var_name(role_prefix, text_path) + elem.text = f"{{{{ {var_name} }}}}" + + # Repeated children get indexes just like in _flatten_xml + counts = Counter(child.tag for child in children) + index_counters: dict[str, int] = defaultdict(int) + + for child in children: + tag = child.tag + if counts[tag] > 1: + idx = index_counters[tag] + index_counters[tag] += 1 + child_path = path + (tag, str(idx)) + else: + child_path = path + (tag,) + walk(child, child_path) + + walk(root, ()) + + +def _generate_xml_template_from_text(role_prefix: str, text: str) -> str: + """ + Generate a Jinja2 template for an XML file, preserving comments and prolog. + + - Attributes become Jinja placeholders: + <server host="localhost" /> + -> <server host="{{ prefix_server_host }}" /> + + - Text nodes become placeholders: + <port>8080</port> + -> <port>{{ prefix_port }}</port> + + but if the element also has attributes/children, the value path + gets a trailing "value" component, matching flattening. 
+ """ + prolog, body = _split_xml_prolog(text) + + # Parse with comments included so are preserved + parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=True)) + root = ET.fromstring(body, parser=parser) + + _apply_jinja_to_xml_tree(role_prefix, root) + + # Pretty indentation if available (Python 3.9+) + indent = getattr(ET, "indent", None) + if indent is not None: + indent(root, space=" ") # type: ignore[arg-type] + + xml_body = ET.tostring(root, encoding="unicode") + return prolog + xml_body + + def generate_template( fmt: str, parsed: Any, @@ -698,11 +903,13 @@ def generate_template( return _generate_ini_template_from_text(role_prefix, original_text) if fmt == "yaml": return _generate_yaml_template_from_text(role_prefix, original_text) + if fmt == "xml": + return _generate_xml_template_from_text(role_prefix, original_text) # For JSON we ignore original_text and reconstruct from parsed structure below if fmt != "json": raise ValueError(f"Unsupported format: {fmt}") - # Fallback: previous behaviour (no comments preserved) + # Fallback: no comments preserved if fmt == "toml": if not isinstance(parsed, dict): raise TypeError("TOML parser result must be a dict") @@ -721,4 +928,9 @@ def generate_template( if not isinstance(parsed, (dict, list)): raise TypeError("JSON parser result must be a dict or list") return _generate_json_template(role_prefix, parsed) + if fmt == "xml": + if not isinstance(parsed, ET.Element): + raise TypeError("XML parser result must be an Element") + xml_str = ET.tostring(parsed, encoding="unicode") + return _generate_xml_template_from_text(role_prefix, xml_str) raise ValueError(f"Unsupported format: {fmt}") diff --git a/tests/samples/ossec.xml b/tests/samples/ossec.xml new file mode 100644 index 0000000..a49a9d8 --- /dev/null +++ b/tests/samples/ossec.xml @@ -0,0 +1,225 @@ + + + + + + web-log + Access log messages grouped. + + + + 31100 + ^2|^3 + is_simple_http_request + Ignored URLs (simple queries). 
+ + + + 31100 + ^4 + Web server 400 error code. + + + + 31101 + \.jpg$|\.gif$|favicon\.ico$|\.png$|robots\.txt$|\.css$|\.js$|\.jpeg$ + is_simple_http_request + Ignored extensions on 400 error codes. + + + + 31100,31108 + =select%20|select\+|insert%20|%20from%20|%20where%20|union%20| + union\+|where\+|null,null|xp_cmdshell + SQL injection attempt. + attack,sql_injection, + + + + 31100 + + + %027|%00|%01|%7f|%2E%2E|%0A|%0D|\.\./\.\.|\.\.\\\.\.|echo;| + cmd\.exe|root\.exe|_mem_bin|msadc|/winnt/|/boot\.ini| + /x90/|default\.ida|/sumthin|nsiislog\.dll|chmod%|wget%|cd%20| + exec%20|\.\./\.\.//|%5C\.\./%5C|\./\./\./\./|2e%2e%5c%2e|\\x5C\\x5C + Common web attack. + attack, + + + + 31100 + %3Cscript|%3C%2Fscript|script>|script%3E|SRC=javascript|IMG%20| + %20ONLOAD=|INPUT%20|iframe%20 + XSS (Cross Site Scripting) attempt. + attack, + + + + 31103, 31104, 31105 + ^200 + A web attack returned code 200 (success). + attack, + + + + 31100 + \?-d|\?-s|\?-a|\?-b|\?-w + PHP CGI-bin vulnerability attempt. + attack, + + + + 31100 + \+as\+varchar + %2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\) + MSSQL Injection attempt (/ur.php, urchin.js) + attack, + + + + + + 31103, 31104, 31105 + ^/search\.php\?search=|^/index\.php\?searchword= + Ignored URLs for the web attacks + + + + 31100 + URL too long. Higher than allowed on most + browsers. Possible attack. + invalid_access, + + + + + + 31100 + ^50 + Web server 500 error code (server error). + + + + 31120 + ^501 + Web server 501 error code (Not Implemented). + + + + 31120 + ^500 + alert_by_email + Web server 500 error code (Internal Error). + system_error, + + + + 31120 + ^503 + alert_by_email + Web server 503 error code (Service unavailable). + + + + + + 31101 + is_valid_crawler + Ignoring google/msn/yahoo bots. + + + + + 31101 + ^499 + Ignored 499's on nginx. + + + + + 31101 + + Multiple web server 400 error codes + from same source ip. 
+ web_scan,recon, + + + + 31103 + + Multiple SQL injection attempts from same + source ip. + attack,sql_injection, + + + + 31104 + + Multiple common web attacks from same source ip. + attack, + + + + 31105 + + Multiple XSS (Cross Site Scripting) attempts + from same source ip. + attack, + + + + 31121 + + Multiple web server 501 error code (Not Implemented). + web_scan,recon, + + + + 31122 + + Multiple web server 500 error code (Internal Error). + system_error, + + + + 31123 + + Multiple web server 503 error code (Service unavailable). + web_scan,recon, + + + + 31100 + =%27|select%2B|insert%2B|%2Bfrom%2B|%2Bwhere%2B|%2Bunion%2B + SQL injection attempt. + attack,sqlinjection, + + + + 31100 + %EF%BC%87|%EF%BC%87|%EF%BC%87|%2531|%u0053%u0045 + SQL injection attempt. + attack,sqlinjection, + + + diff --git a/tests/test_core.py b/tests/test_core.py index 7cfee90..8e65697 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -5,6 +5,7 @@ import configparser import pytest import textwrap import yaml +import xml.etree.ElementTree as ET import jinjaturtle.core as core from jinjaturtle.core import ( @@ -147,12 +148,15 @@ def test_formats_match_expected_extensions(): """ toml_path = SAMPLES_DIR / "tom.toml" ini_path = SAMPLES_DIR / "php.ini" + xml_path = SAMPLES_DIR / "ossec.xml" fmt_toml, _ = parse_config(toml_path) fmt_ini, _ = parse_config(ini_path) + fmt_xml, _ = parse_config(xml_path) assert fmt_toml == "toml" assert fmt_ini == "ini" + assert fmt_xml == "xml" def test_parse_config_toml_missing_tomllib(monkeypatch): @@ -442,3 +446,210 @@ def test_fallback_str_representer_for_unknown_type(): # It should serialize without error, and the string form should appear. 
assert "weird-value" in dumped + + +def test_xml_roundtrip_ossec_web_rules(): + xml_path = SAMPLES_DIR / "ossec.xml" + assert xml_path.is_file(), f"Missing sample XML file: {xml_path}" + + fmt, parsed = parse_config(xml_path) + assert fmt == "xml" + + flat_items = flatten_config(fmt, parsed) + assert flat_items, "Expected at least one flattened item from XML sample" + + defaults_yaml = generate_defaults_yaml("ossec", flat_items) + defaults = yaml.safe_load(defaults_yaml) + + # defaults should be a non-empty dict + assert isinstance(defaults, dict) + assert defaults, "Expected non-empty defaults for XML sample" + + # all keys should be lowercase, start with prefix, and have no spaces + for key in defaults: + assert key.startswith("ossec_") + assert key == key.lower() + assert " " not in key + + # Root attribute should flatten to ossec_name + assert defaults["ossec_name"] == "web,accesslog," + + # There should be at least one default for rule id="31100" + id_keys = [k for k, v in defaults.items() if v == "31100"] + assert id_keys, "Expected to find a default for rule id 31100" + + # At least one of them should be the rule *id* attribute + assert any( + key.startswith("ossec_rule_") and key.endswith("_id") for key in id_keys + ), f"Expected at least one *_id var for value 31100, got: {id_keys}" + + # Template generation (preserving comments) + original_text = xml_path.read_text(encoding="utf-8") + template = generate_template(fmt, parsed, "ossec", original_text=original_text) + assert isinstance(template, str) + assert template.strip(), "Template for XML sample should not be empty" + + # Top-of-file and mid-file comments should be preserved + assert "Official Web access rules for OSSEC." 
in template + assert "Rules to ignore crawlers" in template + + # Each default variable name should appear in the template as a Jinja placeholder + for var_name in defaults: + assert ( + var_name in template + ), f"Variable {var_name} not referenced in XML template" + + +def test_generate_xml_template_from_text_edge_cases(): + """ + Exercise XML text edge cases: + - XML declaration and DOCTYPE in prolog + - top-level and inner comments + - repeated child elements (indexing) + - attributes and text content + """ + text = textwrap.dedent( + """\ + + + + + + text + other + + """ + ) + + tmpl = core._generate_xml_template_from_text("role", text) + + # Prolog and comments preserved + assert " role_attr) + assert "role_attr" in tmpl + + # Repeated elements should be indexed in both attr and text + assert "role_child_0_attr" in tmpl + assert "role_child_0" in tmpl + assert "role_child_1" in tmpl + + +def test_generate_template_xml_type_error(): + """ + Wrong type for XML in generate_template should raise TypeError. + """ + with pytest.raises(TypeError): + generate_template("xml", parsed="not an element", role_prefix="role") + + +def test_flatten_config_xml_type_error(): + """ + Wrong type for XML in flatten_config should raise TypeError. + """ + with pytest.raises(TypeError): + flatten_config("xml", parsed="not-an-element") + + +def test_generate_template_xml_structural_fallback(): + """ + When original_text is not provided for XML, generate_template should use + the structural fallback path (ET.tostring + _generate_xml_template_from_text). 
+ """ + xml_text = textwrap.dedent( + """\ + + 2 + text + + """ + ) + parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=False)) + root = ET.fromstring(xml_text, parser=parser) + + tmpl = generate_template("xml", parsed=root, role_prefix="role") + + # Root attribute path ("@attr",) -> role_attr + assert "role_attr" in tmpl + + # Simple child element text ("child",) -> role_child + assert "role_child" in tmpl + + # Element with both attr and text: + # - attr -> ("node", "@attr") -> role_node_attr + # - text -> ("node", "value") -> role_node_value + assert "role_node_attr" in tmpl + assert "role_node_value" in tmpl + + +def test_split_xml_prolog_only_whitespace(): + """ + Whitespace-only input: prolog is the whitespace, body is empty. + Exercises the 'if i >= n: break' path. + """ + text = " \n\t" + prolog, body = core._split_xml_prolog(text) + assert prolog == text + assert body == "" + + +def test_split_xml_prolog_unterminated_declaration(): + """ + Unterminated XML declaration should hit the 'end == -1' branch and + treat the whole string as body. + """ + text = "" + prolog, body = core._split_xml_prolog(text) + assert prolog == "" + assert body == text + + +def test_flatten_xml_text_with_attributes_uses_value_suffix(): + """ + When an element has both attributes and text, _flatten_xml should store + the text at path + ('value',), not just path. + """ + xml_text = "text" + parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=False)) + root = ET.fromstring(xml_text, parser=parser) + + items = flatten_config("xml", root) + + # Attribute path: ("node", "@attr") -> "x" + assert (("node", "@attr"), "x") in items + + # Text-with-attrs path: ("node", "value") -> "text" + assert (("node", "value"), "text") in items