Add support for XML

2025-11-27 14:26:48 +11:00 · 2025-11-27 14:26:48 +11:00 · 24f7dbea02
commit 24f7dbea02
parent 022990a337
5 changed files with 662 additions and 6 deletions
--- a/README.md
+++ b/README.md
@ -25,10 +25,14 @@ stdout. However, it is possible to output the results to new files.
 ## What sort of config files can it handle?
-TOML, YAML, INI and JSON style config files should be okay. There are always
+TOML, YAML, INI, JSON and XML-style config files should be okay. There are always
 going to be some edge cases in very complex files that are difficult to work
 with, though, so you may still find that you need to tweak the results.
 The tool does not do anything intelligent like detect common sections that
 could practically be turned into 'for' loops in Jinja. You'd have to do those
 sorts of optimisations yourself.
 The goal here is really to *speed up* converting files into Ansible/Jinja2,
 but not necessarily to make it perfect.
@ -68,7 +72,7 @@ jinjaturtle php.ini \
 ## Full usage info
 ```
-usage: jinjaturtle [-h] -r ROLE_NAME [-f {json,ini,toml,yaml}] [-d DEFAULTS_OUTPUT] [-t TEMPLATE_OUTPUT] config
+usage: jinjaturtle [-h] -r ROLE_NAME [-f {json,ini,toml,yaml,xml}] [-d DEFAULTS_OUTPUT] [-t TEMPLATE_OUTPUT] config
 Convert a config file into an Ansible defaults file and Jinja2 template.
--- a/src/jinjaturtle/cli.py
+++ b/src/jinjaturtle/cli.py
@ -30,7 +30,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
    ap.add_argument(
        "-f",
        "--format",
-        choices=["ini", "json", "toml", "yaml"],
+        choices=["ini", "json", "toml", "yaml", "xml"],
        help="Force config format instead of auto-detecting from filename.",
    )
    ap.add_argument(
--- a/src/jinjaturtle/core.py
+++ b/src/jinjaturtle/core.py
@ -2,9 +2,12 @@ from __future__ import annotations
 import configparser
 import json
 import xml.etree.ElementTree as ET
 import yaml
 from collections import Counter, defaultdict
 from pathlib import Path
 from typing import Any, Iterable
 import yaml
 try:
    import tomllib  # Python 3.11+
@ -46,7 +49,7 @@ _TurtleDumper.add_representer(None, _fallback_str_representer)
 def detect_format(path: Path, explicit: str | None = None) -> str:
    """
-    Determine config format (toml, yaml, json, ini-ish) from argument or filename.
+    Determine config format (toml, yaml, json, ini-ish, xml) from argument or filename.
    """
    if explicit:
        return explicit
@ -60,6 +63,8 @@ def detect_format(path: Path, explicit: str | None = None) -> str:
        return "json"
    if suffix in {".ini", ".cfg", ".conf"} or name.endswith(".ini"):
        return "ini"
    if suffix == ".xml":
        return "xml"
    # Fallback: treat as INI-ish
    return "ini"
@ -96,9 +101,76 @@ def parse_config(path: Path, fmt: str | None = None) -> tuple[str, Any]:
            parser.read_file(f)
        return fmt, parser
    if fmt == "xml":
        # Parse XML into an ElementTree Element.
        # We do NOT insert comments here so flattening stays simple.
        text = path.read_text(encoding="utf-8")
        parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=False))
        root = ET.fromstring(text, parser=parser)
        return fmt, root
    raise ValueError(f"Unsupported config format: {fmt}")
 def _flatten_xml(root: ET.Element) -> list[tuple[tuple[str, ...], Any]]:
    """
    Convert an XML element tree into an ordered list of (path, value) pairs.
    The root element acts purely as a container: its tag never appears in a
    path, so its attributes, text and children all start from the empty path.
    How paths are built:
      - Text of a bare element:
          <foo>bar</foo>            -> ("foo",)          "bar"
      - Text of an element that also has attributes or child elements
        gets an extra "value" component:
          <foo attr="x">bar</foo>   -> ("foo", "value")  "bar"
      - Attributes are prefixed with "@":
          <server host="localhost"> -> ("server", "@host") "localhost"
      - Sibling elements sharing a tag are numbered in document order:
          <endpoint>/a</endpoint>
          <endpoint>/b</endpoint>
            -> ("endpoint", "0") "/a"
               ("endpoint", "1") "/b"
        while a tag that occurs once among its siblings is used as-is.
    For each element the pairs are produced as: attributes, then text, then
    each child (depth first). Text is stripped, and whitespace-only text
    produces no pair.
    """
    def emit(node: ET.Element, prefix: tuple[str, ...]):
        # One pair per attribute, in the element's own attribute order.
        yield from (
            (prefix + (f"@{name}",), value) for name, value in node.attrib.items()
        )
        # Only real elements count as children; comment nodes carry a
        # non-string tag and are ignored.
        kids = [kid for kid in node if isinstance(kid.tag, str)]
        stripped = (node.text or "").strip()
        if stripped:
            is_bare = not node.attrib and not kids
            yield (prefix if is_bare else prefix + ("value",), stripped)
        # A tag is indexed only when it occurs more than once among siblings.
        tag_totals = Counter(kid.tag for kid in kids)
        next_index: dict[str, int] = defaultdict(int)
        for kid in kids:
            if tag_totals[kid.tag] > 1:
                step = (kid.tag, str(next_index[kid.tag]))
                next_index[kid.tag] += 1
            else:
                step = (kid.tag,)
            yield from emit(kid, prefix + step)
    return list(emit(root, ()))
 def flatten_config(fmt: str, parsed: Any) -> list[tuple[tuple[str, ...], Any]]:
    """
    Flatten parsed config into a list of (path_tuple, value).
@ -141,6 +213,12 @@ def flatten_config(fmt: str, parsed: Any) -> list[tuple[tuple[str, ...], Any]]:
                else:
                    processed = raw
                items.append(((section, key), processed))
    elif fmt == "xml":
        if not isinstance(parsed, ET.Element):
            raise TypeError("XML parser result must be an Element")
        items = _flatten_xml(parsed)
    else:  # pragma: no cover
        raise ValueError(f"Unsupported format: {fmt}")
@ -677,6 +755,135 @@ def _generate_json_template(role_prefix: str, data: Any) -> str:
    return json.dumps(templated, indent=2, ensure_ascii=False) + "\n"
 def _find_doctype_end(text: str, start: int) -> int:
    """
    Return the index of the '>' closing a DOCTYPE declaration, or -1.
    Scanning begins at *start* (just past the "<!DOCTYPE" keyword). A '>'
    only terminates the declaration when it is outside quoted literals,
    outside comments and outside the "[ ... ]" internal subset, so markup
    declarations such as <!ENTITY ...> inside the subset are skipped over.
    """
    depth = 0  # nesting level of '[' ... ']'
    quote = ""  # the quote character currently open, if any
    j = start
    n = len(text)
    while j < n:
        ch = text[j]
        if quote:
            if ch == quote:
                quote = ""
        elif ch in "\"'":
            quote = ch
        elif text.startswith("<!--", j):
            # Comments may contain quotes/brackets; jump over them whole.
            close = text.find("-->", j + 4)
            if close == -1:
                return -1
            j = close + 3
            continue
        elif ch == "[":
            depth += 1
        elif ch == "]":
            depth = max(depth - 1, 0)
        elif ch == ">" and depth == 0:
            return j
        j += 1
    return -1
 def _split_xml_prolog(text: str) -> tuple[str, str]:
    """
    Split an XML document into (prolog, body), where prolog includes:
      - XML declaration and other processing instructions (<?...?>)
      - top-level comments
      - DOCTYPE (including one with an internal "[ ... ]" subset)
      - the whitespace before and between those items
    The body starts at the root element, or at the first thing that cannot
    be consumed as prolog (an unterminated construct or unexpected content).
    The concatenation prolog + body always equals *text*.
    """
    i = 0
    n = len(text)
    prolog_parts: list[str] = []
    while i < n:
        # Preserve whitespace that precedes the next construct.
        ws_start = i
        while i < n and text[i].isspace():
            i += 1
        prolog_parts.append(text[ws_start:i])
        if i >= n:
            break
        # Work out where the construct at position i ends; `width` is the
        # length of its closing delimiter.
        if text.startswith("<?", i):
            end = text.find("?>", i + 2)
            width = 2
        elif text.startswith("<!--", i):
            end = text.find("-->", i + 4)
            width = 3
        elif text.startswith("<!DOCTYPE", i):
            # A plain find(">") would stop inside an internal subset.
            end = _find_doctype_end(text, i + 9)
            width = 1
        else:
            # Root element, or unexpected content: the prolog is over.
            break
        if end == -1:
            # Unterminated construct: leave it in the body for the parser.
            break
        prolog_parts.append(text[i : end + width])
        i = end + width
    return "".join(prolog_parts), text[i:]
 def _apply_jinja_to_xml_tree(role_prefix: str, root: ET.Element) -> None:
    """
    Mutate the XML tree in-place, replacing scalar values with Jinja
    expressions based on the same paths used in _flatten_xml.
    Parameters:
      role_prefix: prefix handed to make_var_name for every variable.
      root: root element; treated as a container, so paths start empty.
    Returns None; the tree is modified in place.
    The tree may contain comment nodes. When comments are kept in the tree,
    text that follows a comment at the start of an element is stored in the
    comment's tail rather than in the element's text, whereas a parse that
    skips comments reports the same characters as element text. To stay in
    step with paths produced from a comment-free parse, the element's text
    and the tails of its leading non-element nodes are treated as one value:
        <port><!-- note -->8080</port>  ->  <port><!-- note -->{{ x_port }}</port>
    """
    def walk(elem: ET.Element, path: tuple[str, ...]) -> None:
        # Attributes: every value becomes a placeholder; the keys remain.
        for attr_name in list(elem.attrib):
            var_name = make_var_name(role_prefix, path + (f"@{attr_name}",))
            elem.set(attr_name, f"{{{{ {var_name} }}}}")
        nodes = list(elem)
        # Children (exclude comments)
        children = [c for c in nodes if isinstance(c.tag, str)]
        # Non-element nodes that come before the first real child; their
        # tails are part of the element's leading text.
        leading: list[ET.Element] = []
        for node in nodes:
            if isinstance(node.tag, str):
                break
            leading.append(node)
        # pieces[0] is elem.text; pieces[k] (k > 0) is leading[k - 1].tail.
        pieces = [elem.text or ""] + [node.tail or "" for node in leading]
        if "".join(pieces).strip():
            if not elem.attrib and not children:
                text_path = path
            else:
                text_path = path + ("value",)
            var_name = make_var_name(role_prefix, text_path)
            placeholder = f"{{{{ {var_name} }}}}"
            placed = False
            for slot, piece in enumerate(pieces):
                if not piece.strip():
                    continue  # whitespace-only pieces are left as they are
                # The first piece with content receives the placeholder;
                # later pieces belong to the same value, so they are blanked.
                replacement = "" if placed else placeholder
                placed = True
                if slot == 0:
                    elem.text = replacement
                else:
                    leading[slot - 1].tail = replacement
        # Repeated children get indexes just like in _flatten_xml
        counts = Counter(child.tag for child in children)
        index_counters: dict[str, int] = defaultdict(int)
        for child in children:
            tag = child.tag
            if counts[tag] > 1:
                idx = index_counters[tag]
                index_counters[tag] += 1
                child_path = path + (tag, str(idx))
            else:
                child_path = path + (tag,)
            walk(child, child_path)
    walk(root, ())
 def _split_xml_epilog(body: str) -> tuple[str, str]:
    """
    Split *body* into (document, epilog).
    The epilog is whatever trails the root element: whitespace, comments and
    processing instructions. It is peeled off from the end of the string, so
    document + epilog always equals *body*.
    """
    end = len(body)
    while True:
        content_end = len(body[:end].rstrip())
        if body.endswith("-->", 0, content_end):
            opener, closer_len = "<!--", 3
        elif body.endswith("?>", 0, content_end):
            opener, closer_len = "<?", 2
        else:
            # Reached the root element's closing '>' (or an empty body).
            return body[:content_end], body[content_end:]
        start = body.rfind(opener, 0, content_end - closer_len)
        if start == -1:
            # Closer without an opener: stop here and let the parser complain.
            return body[:end], body[end:]
        end = start
 def _generate_xml_template_from_text(role_prefix: str, text: str) -> str:
    """
    Generate a Jinja2 template for an XML file, preserving comments, the
    prolog, and anything that follows the root element.
    - Attributes become Jinja placeholders:
        <server host="localhost" />
          -> <server host="{{ prefix_server_host }}" />
    - Text nodes become placeholders:
        <port>8080</port>
          -> <port>{{ prefix_port }}</port>
      but if the element also has attributes/children, the value path
      gets a trailing "value" component, matching flattening.
    Parameters:
      role_prefix: prefix used for every generated variable name.
      text: the XML document as a string.
    Returns the template text: prolog + re-serialised root + epilog.
    Parsing errors from ElementTree propagate to the caller.
    """
    prolog, body = _split_xml_prolog(text)
    # ElementTree does not attach nodes that sit outside the root element,
    # so trailing comments and the final newline are carried over verbatim.
    document, epilog = _split_xml_epilog(body)
    # Parse with comments included so <!-- --> are preserved
    parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=True))
    root = ET.fromstring(document, parser=parser)
    _apply_jinja_to_xml_tree(role_prefix, root)
    # Pretty indentation if available (Python 3.9+)
    indent = getattr(ET, "indent", None)
    if indent is not None:
        indent(root, space="  ")  # type: ignore[arg-type]
    xml_body = ET.tostring(root, encoding="unicode")
    return prolog + xml_body + epilog
 def generate_template(
    fmt: str,
    parsed: Any,
@ -698,11 +905,13 @@ def generate_template(
            return _generate_ini_template_from_text(role_prefix, original_text)
        if fmt == "yaml":
            return _generate_yaml_template_from_text(role_prefix, original_text)
        if fmt == "xml":
            return _generate_xml_template_from_text(role_prefix, original_text)
        # For JSON we ignore original_text and reconstruct from parsed structure below
        if fmt != "json":
            raise ValueError(f"Unsupported format: {fmt}")
-    # Fallback: previous behaviour (no comments preserved)
+    # Fallback: no comments preserved
    if fmt == "toml":
        if not isinstance(parsed, dict):
            raise TypeError("TOML parser result must be a dict")
@ -721,4 +930,11 @@ def generate_template(
        if not isinstance(parsed, (dict, list)):
            raise TypeError("JSON parser result must be a dict or list")
        return _generate_json_template(role_prefix, parsed)
    if fmt == "xml":
        if not isinstance(parsed, ET.Element):
            raise TypeError("XML parser result must be an Element")
        # We don't have original_text, so comments are already lost.
        # Re-serialise and run through the same templating path.
        xml_str = ET.tostring(parsed, encoding="unicode")
        return _generate_xml_template_from_text(role_prefix, xml_str)
    raise ValueError(f"Unsupported format: {fmt}")
--- a/tests/samples/ossec.xml
+++ b/tests/samples/ossec.xml
@ -0,0 +1,225 @@
 <!-- @(#) $Id: ./etc/rules/web_rules.xml, 2013/02/28 dcid Exp $
  -
  -  Official Web access rules for OSSEC.
  -
  -  Copyright (C) 2009 Trend Micro Inc.
  -  All rights reserved.
  -
  -  This program is a free software; you can redistribute it
  -  and/or modify it under the terms of the GNU General Public
  -  License (version 2) as published by the FSF - Free Software
  -  Foundation.
  -
  -  License details: http://www.ossec.net/en/licensing.html
  -->
 <group name="web,accesslog,">
  <rule id="31100" level="0">
    <category>web-log</category>
    <description>Access log messages grouped.</description>
  </rule>
  <rule id="31108" level="0">
    <if_sid>31100</if_sid>
    <id_pcre2>^2|^3</id_pcre2>
    <compiled_rule>is_simple_http_request</compiled_rule>
    <description>Ignored URLs (simple queries).</description>
   </rule>
  <rule id="31101" level="5">
    <if_sid>31100</if_sid>
    <id_pcre2>^4</id_pcre2>
    <description>Web server 400 error code.</description>
  </rule>
  <rule id="31102" level="0">
    <if_sid>31101</if_sid>
    <url_pcre2>\.jpg$|\.gif$|favicon\.ico$|\.png$|robots\.txt$|\.css$|\.js$|\.jpeg$</url_pcre2>
    <compiled_rule>is_simple_http_request</compiled_rule>
    <description>Ignored extensions on 400 error codes.</description>
  </rule>
  <rule id="31103" level="6">
    <if_sid>31100,31108</if_sid>
    <url_pcre2>=select%20|select\+|insert%20|%20from%20|%20where%20|union%20|</url_pcre2>
    <url_pcre2>union\+|where\+|null,null|xp_cmdshell</url_pcre2>
    <description>SQL injection attempt.</description>
    <group>attack,sql_injection,</group>
  </rule>
  <rule id="31104" level="6">
    <if_sid>31100</if_sid>
     <!-- Attempts at directory traversal, simple SQL injections,
       -  or access to the etc or bin directory (unix). -->
    <url_pcre2>%027|%00|%01|%7f|%2E%2E|%0A|%0D|\.\./\.\.|\.\.\\\.\.|echo;|</url_pcre2>
    <url_pcre2>cmd\.exe|root\.exe|_mem_bin|msadc|/winnt/|/boot\.ini|</url_pcre2>
    <url_pcre2>/x90/|default\.ida|/sumthin|nsiislog\.dll|chmod%|wget%|cd%20|</url_pcre2>
    <url_pcre2>exec%20|\.\./\.\.//|%5C\.\./%5C|\./\./\./\./|2e%2e%5c%2e|\\x5C\\x5C</url_pcre2>
    <description>Common web attack.</description>
    <group>attack,</group>
  </rule>
  <rule id="31105" level="6">
    <if_sid>31100</if_sid>
    <url_pcre2>%3Cscript|%3C%2Fscript|script>|script%3E|SRC=javascript|IMG%20|</url_pcre2>
    <url_pcre2>%20ONLOAD=|INPUT%20|iframe%20</url_pcre2>
    <description>XSS (Cross Site Scripting) attempt.</description>
    <group>attack,</group>
  </rule>
  <rule id="31106" level="6">
    <if_sid>31103, 31104, 31105</if_sid>
    <id_pcre2>^200</id_pcre2>
    <description>A web attack returned code 200 (success).</description>
    <group>attack,</group>
  </rule>
  <rule id="31110" level="6">
    <if_sid>31100</if_sid>
    <url_pcre2>\?-d|\?-s|\?-a|\?-b|\?-w</url_pcre2>
    <description>PHP CGI-bin vulnerability attempt.</description>
    <group>attack,</group>
  </rule>
  <rule id="31109" level="6">
    <if_sid>31100</if_sid>
    <url_pcre2>\+as\+varchar</url_pcre2>
    <pcre2>%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)</pcre2>
    <description>MSSQL Injection attempt (/ur.php, urchin.js)</description>
    <group>attack,</group>
  </rule>
   <!-- If your site has a search engine, you may need to ignore
     - it here.
     -->
  <rule id="31107" level="0">
    <if_sid>31103, 31104, 31105</if_sid>
    <url_pcre2>^/search\.php\?search=|^/index\.php\?searchword=</url_pcre2>
    <description>Ignored URLs for the web attacks</description>
  </rule>
  <rule id="31115" level="13" maxsize="7900">
    <if_sid>31100</if_sid>
    <description>URL too long. Higher than allowed on most </description>
    <description>browsers. Possible attack.</description>
    <group>invalid_access,</group>
  </rule>
  <!-- 500 error codes, server error
    - http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
    -->
  <rule id="31120" level="5">
    <if_sid>31100</if_sid>
    <id_pcre2>^50</id_pcre2>
    <description>Web server 500 error code (server error).</description>
  </rule>
  <rule id="31121" level="4">
    <if_sid>31120</if_sid>
    <id_pcre2>^501</id_pcre2>
    <description>Web server 501 error code (Not Implemented).</description>
  </rule>
  <rule id="31122" level="5">
    <if_sid>31120</if_sid>
    <id_pcre2>^500</id_pcre2>
    <options>alert_by_email</options>
    <description>Web server 500 error code (Internal Error).</description>
    <group>system_error,</group>
  </rule>
  <rule id="31123" level="4">
    <if_sid>31120</if_sid>
    <id_pcre2>^503</id_pcre2>
    <options>alert_by_email</options>
    <description>Web server 503 error code (Service unavailable).</description>
  </rule>
  <!-- Rules to ignore crawlers -->
  <rule id="31140" level="0">
    <if_sid>31101</if_sid>
    <compiled_rule>is_valid_crawler</compiled_rule>
    <description>Ignoring google/msn/yahoo bots.</description>
  </rule>
  <!-- Ignoring nginx 499's -->
  <rule id="31141" level="0">
    <if_sid>31101</if_sid>
    <id_pcre2>^499</id_pcre2>
    <description>Ignored 499's on nginx.</description>
  </rule>
  <rule id="31151" level="10" frequency="12" timeframe="90">
    <if_matched_sid>31101</if_matched_sid>
    <same_source_ip />
    <description>Multiple web server 400 error codes </description>
    <description>from same source ip.</description>
    <group>web_scan,recon,</group>
  </rule>
  <rule id="31152" level="10" frequency="6" timeframe="120">
    <if_matched_sid>31103</if_matched_sid>
    <same_source_ip />
    <description>Multiple SQL injection attempts from same </description>
    <description>source ip.</description>
    <group>attack,sql_injection,</group>
  </rule>
  <rule id="31153" level="10" frequency="8" timeframe="120">
    <if_matched_sid>31104</if_matched_sid>
    <same_source_ip />
    <description>Multiple common web attacks from same source ip.</description>
    <group>attack,</group>
  </rule>
  <rule id="31154" level="10" frequency="8" timeframe="120">
    <if_matched_sid>31105</if_matched_sid>
    <same_source_ip />
    <description>Multiple XSS (Cross Site Scripting) attempts </description>
    <description>from same source ip.</description>
    <group>attack,</group>
  </rule>
  <rule id="31161" level="10" frequency="12" timeframe="120">
    <if_matched_sid>31121</if_matched_sid>
    <same_source_ip />
    <description>Multiple web server 501 error code (Not Implemented).</description>
    <group>web_scan,recon,</group>
  </rule>
  <rule id="31162" level="10" frequency="12" timeframe="120">
    <if_matched_sid>31122</if_matched_sid>
    <same_source_ip />
    <description>Multiple web server 500 error code (Internal Error).</description>
    <group>system_error,</group>
  </rule>
  <rule id="31163" level="10" frequency="12" timeframe="120">
    <if_matched_sid>31123</if_matched_sid>
    <same_source_ip />
    <description>Multiple web server 503 error code (Service unavailable).</description>
    <group>web_scan,recon,</group>
  </rule>
  <rule id="31164" level="6">
    <if_sid>31100</if_sid>
    <url_pcre2>=%27|select%2B|insert%2B|%2Bfrom%2B|%2Bwhere%2B|%2Bunion%2B</url_pcre2>
    <description>SQL injection attempt.</description>
    <group>attack,sqlinjection,</group>
  </rule>
  <rule id="31165" level="6">
    <if_sid>31100</if_sid>
    <url_pcre2>%EF%BC%87|%EF%BC%87|%EF%BC%87|%2531|%u0053%u0045</url_pcre2>
    <description>SQL injection attempt.</description>
    <group>attack,sqlinjection,</group>
  </rule>
 </group> <!-- Web access log -->
--- a/tests/test_core.py
+++ b/tests/test_core.py
@ -5,6 +5,7 @@ import configparser
 import pytest
 import textwrap
 import yaml
 import xml.etree.ElementTree as ET
 import jinjaturtle.core as core
 from jinjaturtle.core import (
@ -147,12 +148,15 @@ def test_formats_match_expected_extensions():
    """
    toml_path = SAMPLES_DIR / "tom.toml"
    ini_path = SAMPLES_DIR / "php.ini"
    xml_path = SAMPLES_DIR / "ossec.xml"
    fmt_toml, _ = parse_config(toml_path)
    fmt_ini, _ = parse_config(ini_path)
    fmt_xml, _ = parse_config(xml_path)
    assert fmt_toml == "toml"
    assert fmt_ini == "ini"
    assert fmt_xml == "xml"
 def test_parse_config_toml_missing_tomllib(monkeypatch):
@ -442,3 +446,210 @@ def test_fallback_str_representer_for_unknown_type():
    # It should serialize without error, and the string form should appear.
    assert "weird-value" in dumped
 def test_xml_roundtrip_ossec_web_rules():
    """
    Push the OSSEC web-rules sample through parse -> flatten -> defaults ->
    template, checking variable naming and comment preservation on the way.
    """
    sample = SAMPLES_DIR / "ossec.xml"
    assert sample.is_file(), f"Missing sample XML file: {sample}"
    fmt, tree = parse_config(sample)
    assert fmt == "xml"
    pairs = flatten_config(fmt, tree)
    assert pairs, "Expected at least one flattened item from XML sample"
    defaults = yaml.safe_load(generate_defaults_yaml("ossec", pairs))
    # The defaults document must load as a populated mapping.
    assert isinstance(defaults, dict)
    assert defaults, "Expected non-empty defaults for XML sample"
    # Variable names: prefixed, lowercase, free of spaces.
    for name in defaults:
        assert name.startswith("ossec_")
        assert name == name.lower()
        assert " " not in name
    # The root <group name="web,accesslog,"> attribute flattens to ossec_name.
    assert defaults["ossec_name"] == "web,accesslog,"
    # Some variable must hold the value of rule id="31100" ...
    holders = [name for name, value in defaults.items() if value == "31100"]
    assert holders, "Expected to find a default for rule id 31100"
    # ... and at least one of them must be a rule's *id* attribute.
    rule_id_vars = [
        name
        for name in holders
        if name.startswith("ossec_rule_") and name.endswith("_id")
    ]
    assert (
        rule_id_vars
    ), f"Expected at least one *_id var for value 31100, got: {holders}"
    # Template generation, handing over the original text so that comments
    # can be preserved.
    template = generate_template(
        fmt, tree, "ossec", original_text=sample.read_text(encoding="utf-8")
    )
    assert isinstance(template, str)
    assert template.strip(), "Template for XML sample should not be empty"
    # One comment from the top of the file and one from the middle.
    for comment_text in (
        "Official Web access rules for OSSEC.",
        "Rules to ignore crawlers",
    ):
        assert comment_text in template
    # Every default must be referenced somewhere in the template.
    for var_name in defaults:
        assert (
            var_name in template
        ), f"Variable {var_name} not referenced in XML template"
 def test_generate_xml_template_from_text_edge_cases():
    """
    Template generation on a document combining several features:
      - an XML declaration and a DOCTYPE in the prolog
      - a comment before the root and another inside it
      - two sibling elements with the same tag (indexed paths)
      - attributes together with text content
    """
    source = textwrap.dedent(
        """\
        <?xml version="1.0"?>
        <!-- top comment -->
        <!DOCTYPE something>
        <root attr="1">
          <!-- inner comment -->
          <child attr="2">text</child>
          <child>other</child>
        </root>
        """
    )
    rendered = core._generate_xml_template_from_text("role", source)
    expected_fragments = (
        # Prolog and comments survive.
        "<?xml version",
        "top comment",
        "inner comment",
        # Root attribute: path ("@attr",) -> role_attr
        "role_attr",
        # Repeated <child> elements are indexed for attributes and text.
        "role_child_0_attr",
        "role_child_0",
        "role_child_1",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
 def test_generate_template_xml_type_error():
    """
    generate_template must raise TypeError when the XML ``parsed`` argument
    is not an Element.
    """
    not_an_element = "not an element"
    with pytest.raises(TypeError):
        generate_template("xml", parsed=not_an_element, role_prefix="role")
 def test_flatten_config_xml_type_error():
    """
    flatten_config must raise TypeError when the XML ``parsed`` argument is
    not an Element.
    """
    not_an_element = "not-an-element"
    with pytest.raises(TypeError):
        flatten_config("xml", parsed=not_an_element)
 def test_generate_template_xml_structural_fallback():
    """
    Without original_text, generate_template for XML has to work from the
    parsed Element alone (ET.tostring + _generate_xml_template_from_text).
    """
    source = textwrap.dedent(
        """\
        <root attr="1">
          <child>2</child>
          <node attr="x">text</node>
        </root>
        """
    )
    comment_free = ET.XMLParser(target=ET.TreeBuilder(insert_comments=False))
    tree = ET.fromstring(source, parser=comment_free)
    rendered = generate_template("xml", parsed=tree, role_prefix="role")
    expected_vars = (
        # Root attribute path ("@attr",) -> role_attr
        "role_attr",
        # Simple child element text ("child",) -> role_child
        "role_child",
        # Element with both an attribute and text:
        #  - attr -> ("node", "@attr")  -> role_node_attr
        #  - text -> ("node", "value")  -> role_node_value
        "role_node_attr",
        "role_node_value",
    )
    for var in expected_vars:
        assert var in rendered
 def test_split_xml_prolog_only_whitespace():
    """
    Input made only of whitespace is returned entirely as prolog, leaving an
    empty body (the 'if i >= n: break' path).
    """
    blank = "   \n\t"
    head, rest = core._split_xml_prolog(blank)
    assert (head, rest) == (blank, "")
 def test_split_xml_prolog_unterminated_declaration():
    """
    An XML declaration lacking its '?>' takes the 'end == -1' branch, so the
    whole input comes back as body.
    """
    broken = "<?xml version='1.0'"
    head, rest = core._split_xml_prolog(broken)
    assert (head, rest) == ("", broken)
 def test_split_xml_prolog_unterminated_comment():
    """
    A comment lacking its '-->' takes the comment 'end == -1' branch and is
    returned as body.
    """
    broken = "<!-- no end"
    head, rest = core._split_xml_prolog(broken)
    assert (head, rest) == ("", broken)
 def test_split_xml_prolog_unterminated_doctype():
    """
    A DOCTYPE lacking its closing '>' takes the DOCTYPE 'end == -1' branch
    and is returned as body.
    """
    broken = "<!DOCTYPE foo"
    head, rest = core._split_xml_prolog(broken)
    assert (head, rest) == ("", broken)
 def test_split_xml_prolog_unexpected_content():
    """
    Leading content that is not XML markup hits the 'unexpected content'
    break, so nothing is classified as prolog.
    """
    junk = "garbage<root/>"
    head, rest = core._split_xml_prolog(junk)
    assert (head, rest) == ("", junk)
 def test_flatten_xml_text_with_attributes_uses_value_suffix():
    """
    For an element carrying both an attribute and text, flattening must file
    the text under path + ('value',) instead of the bare element path.
    """
    comment_free = ET.XMLParser(target=ET.TreeBuilder(insert_comments=False))
    tree = ET.fromstring("<root><node attr='x'>text</node></root>", parser=comment_free)
    flattened = flatten_config("xml", tree)
    expected_pairs = (
        # Attribute path: ("node", "@attr") -> "x"
        (("node", "@attr"), "x"),
        # Text-with-attrs path: ("node", "value") -> "text"
        (("node", "value"), "text"),
    )
    for pair in expected_pairs:
        assert pair in flattened