Add support for XML

This commit is contained in:
Miguel Jacq 2025-11-27 14:26:48 +11:00
parent 022990a337
commit 24f7dbea02
Signed by: mig5
GPG key ID: 59B3F0C24135C6A9
5 changed files with 662 additions and 6 deletions

View file

@ -25,10 +25,14 @@ stdout. However, it is possible to output the results to new files.
## What sort of config files can it handle? ## What sort of config files can it handle?
TOML, YAML, INI and JSON style config files should be okay. There are always TOML, YAML, INI, JSON and XML-style config files should be okay. There are always
going to be some edge cases in very complex files that are difficult to work going to be some edge cases in very complex files that are difficult to work
with, though, so you may still find that you need to tweak the results. with, though, so you may still find that you need to tweak the results.
The tool does not do anything intelligent like detect common sections that
could practically be turned into 'for' loops in Jinja. You'd have to do those
sorts of optimisations yourself.
The goal here is really to *speed up* converting files into Ansible/Jinja2, The goal here is really to *speed up* converting files into Ansible/Jinja2,
but not necessarily to make it perfect. but not necessarily to make it perfect.
@ -68,7 +72,7 @@ jinjaturtle php.ini \
## Full usage info ## Full usage info
``` ```
usage: jinjaturtle [-h] -r ROLE_NAME [-f {json,ini,toml,yaml}] [-d DEFAULTS_OUTPUT] [-t TEMPLATE_OUTPUT] config usage: jinjaturtle [-h] -r ROLE_NAME [-f {json,ini,toml,yaml,xml}] [-d DEFAULTS_OUTPUT] [-t TEMPLATE_OUTPUT] config
Convert a config file into an Ansible defaults file and Jinja2 template. Convert a config file into an Ansible defaults file and Jinja2 template.

View file

@ -30,7 +30,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
ap.add_argument( ap.add_argument(
"-f", "-f",
"--format", "--format",
choices=["ini", "json", "toml", "yaml"], choices=["ini", "json", "toml", "yaml", "xml"],
help="Force config format instead of auto-detecting from filename.", help="Force config format instead of auto-detecting from filename.",
) )
ap.add_argument( ap.add_argument(

View file

@ -2,9 +2,12 @@ from __future__ import annotations
import configparser import configparser
import json import json
import xml.etree.ElementTree as ET
import yaml
from collections import Counter, defaultdict
from pathlib import Path from pathlib import Path
from typing import Any, Iterable from typing import Any, Iterable
import yaml
try: try:
import tomllib # Python 3.11+ import tomllib # Python 3.11+
@ -46,7 +49,7 @@ _TurtleDumper.add_representer(None, _fallback_str_representer)
def detect_format(path: Path, explicit: str | None = None) -> str: def detect_format(path: Path, explicit: str | None = None) -> str:
""" """
Determine config format (toml, yaml, json, ini-ish) from argument or filename. Determine config format (toml, yaml, json, ini-ish, xml) from argument or filename.
""" """
if explicit: if explicit:
return explicit return explicit
@ -60,6 +63,8 @@ def detect_format(path: Path, explicit: str | None = None) -> str:
return "json" return "json"
if suffix in {".ini", ".cfg", ".conf"} or name.endswith(".ini"): if suffix in {".ini", ".cfg", ".conf"} or name.endswith(".ini"):
return "ini" return "ini"
if suffix == ".xml":
return "xml"
# Fallback: treat as INI-ish # Fallback: treat as INI-ish
return "ini" return "ini"
@ -96,9 +101,76 @@ def parse_config(path: Path, fmt: str | None = None) -> tuple[str, Any]:
parser.read_file(f) parser.read_file(f)
return fmt, parser return fmt, parser
if fmt == "xml":
# Parse XML into an ElementTree Element.
# We do NOT insert comments here so flattening stays simple.
text = path.read_text(encoding="utf-8")
parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=False))
root = ET.fromstring(text, parser=parser)
return fmt, root
raise ValueError(f"Unsupported config format: {fmt}") raise ValueError(f"Unsupported config format: {fmt}")
def _flatten_xml(root: ET.Element) -> list[tuple[tuple[str, ...], Any]]:
    """
    Convert an XML element tree into a flat list of (path, value) pairs.

    How paths are built:
    - The root tag itself never appears in a path; the root only acts as a
      container, so its attributes and children are top-level entries.
    - Attributes are keyed with a leading "@":
        <server host="localhost">  -> ("server", "@host") = "localhost"
    - Text of a plain leaf (no attributes, no child elements) is stored at
      the element's own path:
        <foo>bar</foo>             -> ("foo",) = "bar"
    - Text of an element that also has attributes or children gets an extra
      "value" component:
        <foo attr="x">bar</foo>    -> ("foo", "value") = "bar"
    - Sibling elements sharing a tag are numbered in document order:
        <endpoint>/a</endpoint>
        <endpoint>/b</endpoint>    -> ("endpoint", "0") = "/a"
                                      ("endpoint", "1") = "/b"
      A tag that occurs only once among its siblings gets no index.

    Text is stripped of surrounding whitespace and skipped when empty.
    """
    flat: list[tuple[tuple[str, ...], Any]] = []

    def visit(node: ET.Element, prefix: tuple[str, ...]) -> None:
        # Attributes are emitted first, in the order the element stores them.
        flat.extend(
            (prefix + (f"@{name}",), value) for name, value in node.attrib.items()
        )

        # Comment nodes have a non-string tag; only real elements count as
        # children.
        elements = [kid for kid in node if isinstance(kid.tag, str)]

        content = (node.text or "").strip()
        if content:
            plain_leaf = not node.attrib and not elements
            flat.append((prefix if plain_leaf else prefix + ("value",), content))

        # Tags seen more than once among these siblings receive a running index.
        tag_totals = Counter(kid.tag for kid in elements)
        next_index: dict[str, int] = {}
        for kid in elements:
            name = kid.tag
            if tag_totals[name] > 1:
                position = next_index.get(name, 0)
                next_index[name] = position + 1
                visit(kid, prefix + (name, str(position)))
            else:
                visit(kid, prefix + (name,))

    visit(root, ())
    return flat
def flatten_config(fmt: str, parsed: Any) -> list[tuple[tuple[str, ...], Any]]: def flatten_config(fmt: str, parsed: Any) -> list[tuple[tuple[str, ...], Any]]:
""" """
Flatten parsed config into a list of (path_tuple, value). Flatten parsed config into a list of (path_tuple, value).
@ -141,6 +213,12 @@ def flatten_config(fmt: str, parsed: Any) -> list[tuple[tuple[str, ...], Any]]:
else: else:
processed = raw processed = raw
items.append(((section, key), processed)) items.append(((section, key), processed))
elif fmt == "xml":
if not isinstance(parsed, ET.Element):
raise TypeError("XML parser result must be an Element")
items = _flatten_xml(parsed)
else: # pragma: no cover else: # pragma: no cover
raise ValueError(f"Unsupported format: {fmt}") raise ValueError(f"Unsupported format: {fmt}")
@ -677,6 +755,135 @@ def _generate_json_template(role_prefix: str, data: Any) -> str:
return json.dumps(templated, indent=2, ensure_ascii=False) + "\n" return json.dumps(templated, indent=2, ensure_ascii=False) + "\n"
def _split_xml_prolog(text: str) -> tuple[str, str]:
    """
    Split an XML document into (prolog, body).

    The prolog is everything that precedes the root element:
    - the XML declaration and any other processing instruction (<?...?>)
    - top-level comments
    - the DOCTYPE declaration, including an internal subset in [...]
    - the whitespace between those constructs

    The body is the remainder, normally starting at the root element.
    Scanning stops at the first thing that is not a complete prolog
    construct: an unterminated declaration/comment/DOCTYPE, the root
    element, or unexpected non-markup text. Whatever was collected up to
    that point (including leading whitespace) is the prolog; the rest is
    returned untouched as the body, so prolog + body == text always holds.
    """
    pos = 0
    length = len(text)
    collected: list[str] = []

    while pos < length:
        # Whitespace between prolog constructs is kept verbatim.
        ws_start = pos
        while pos < length and text[pos].isspace():
            pos += 1
        collected.append(text[ws_start:pos])
        if pos >= length:
            break

        if text.startswith("<?", pos):
            end = text.find("?>", pos + 2)
            stop = end + 2
        elif text.startswith("<!--", pos):
            end = text.find("-->", pos + 4)
            stop = end + 3
        elif text.startswith("<!DOCTYPE", pos):
            # A plain find(">") would stop inside an internal subset or a
            # quoted identifier, so use a scanner that understands both.
            end = _find_doctype_end(text, pos + 9)
            stop = end + 1
        else:
            # Root element or unexpected content: the prolog ends here.
            break

        if end == -1:
            # Unterminated construct: leave it in the body for the parser
            # to report.
            break

        collected.append(text[pos:stop])
        pos = stop

    return "".join(collected), text[pos:]


def _find_doctype_end(text: str, start: int) -> int:
    """
    Return the index of the '>' that closes a DOCTYPE declaration.

    *start* is the index just past the "<!DOCTYPE" keyword. A '>' is only
    treated as the terminator when it is outside quoted literals, outside
    comments and outside the [...] internal subset. Returns -1 when no
    terminator is found.
    """
    depth = 0
    quote = ""
    pos = start
    length = len(text)
    while pos < length:
        ch = text[pos]
        if quote:
            if ch == quote:
                quote = ""
        elif ch in "\"'":
            quote = ch
        elif text.startswith("<!--", pos):
            # Skip comments whole so apostrophes inside them are not
            # mistaken for the start of a quoted literal.
            close = text.find("-->", pos + 4)
            if close == -1:
                return -1
            pos = close + 3
            continue
        elif ch == "[":
            depth += 1
        elif ch == "]":
            depth = max(depth - 1, 0)
        elif ch == ">" and depth == 0:
            return pos
        pos += 1
    return -1
def _apply_jinja_to_xml_tree(role_prefix: str, root: ET.Element) -> None:
    """
    Rewrite the tree in place so every scalar becomes a Jinja expression.

    Each attribute value and each non-blank element text is replaced with
    "{{ <var> }}", where <var> is make_var_name(role_prefix, path) and the
    path follows the same rules as _flatten_xml: "@name" for attributes, a
    trailing "value" for text on elements that also have attributes or
    children, and a numeric index for repeated sibling tags.
    """

    def placeholder(var_path: tuple[str, ...]) -> str:
        return f"{{{{ {make_var_name(role_prefix, var_path)} }}}}"

    def visit(node: ET.Element, prefix: tuple[str, ...]) -> None:
        # Snapshot the attribute names since values are rewritten as we go.
        for name in list(node.attrib):
            node.set(name, placeholder(prefix + (f"@{name}",)))

        # Comment nodes have a non-string tag and are left untouched.
        elements = [kid for kid in node if isinstance(kid.tag, str)]

        if (node.text or "").strip():
            plain_leaf = not node.attrib and not elements
            node.text = placeholder(prefix if plain_leaf else prefix + ("value",))

        # Index repeated sibling tags exactly as _flatten_xml does.
        tag_totals = Counter(kid.tag for kid in elements)
        next_index: dict[str, int] = {}
        for kid in elements:
            name = kid.tag
            if tag_totals[name] > 1:
                position = next_index.get(name, 0)
                next_index[name] = position + 1
                visit(kid, prefix + (name, str(position)))
            else:
                visit(kid, prefix + (name,))

    visit(root, ())
def _generate_xml_template_from_text(role_prefix: str, text: str) -> str:
    """
    Build a Jinja2 template from raw XML text, keeping the prolog and
    in-tree comments.

    - Attribute values turn into placeholders:
        <server host="localhost" />
          -> <server host="{{ prefix_server_host }}" />
    - Element text turns into placeholders:
        <port>8080</port>
          -> <port>{{ prefix_port }}</port>
      When the element also carries attributes or children, the variable
      path gains a trailing "value" component, as in flattening.

    The prolog (declaration, top-level comments, DOCTYPE) is split off
    first and prepended unchanged to the re-serialised root element.
    """
    prolog, body = _split_xml_prolog(text)

    # insert_comments=True keeps <!-- --> nodes inside the tree so they
    # survive serialisation.
    builder = ET.TreeBuilder(insert_comments=True)
    root = ET.fromstring(body, parser=ET.XMLParser(target=builder))

    _apply_jinja_to_xml_tree(role_prefix, root)

    # ET.indent only exists on Python 3.9+; skip pretty-printing otherwise.
    if getattr(ET, "indent", None) is not None:
        ET.indent(root, space=" ")

    return prolog + ET.tostring(root, encoding="unicode")
def generate_template( def generate_template(
fmt: str, fmt: str,
parsed: Any, parsed: Any,
@ -698,11 +905,13 @@ def generate_template(
return _generate_ini_template_from_text(role_prefix, original_text) return _generate_ini_template_from_text(role_prefix, original_text)
if fmt == "yaml": if fmt == "yaml":
return _generate_yaml_template_from_text(role_prefix, original_text) return _generate_yaml_template_from_text(role_prefix, original_text)
if fmt == "xml":
return _generate_xml_template_from_text(role_prefix, original_text)
# For JSON we ignore original_text and reconstruct from parsed structure below # For JSON we ignore original_text and reconstruct from parsed structure below
if fmt != "json": if fmt != "json":
raise ValueError(f"Unsupported format: {fmt}") raise ValueError(f"Unsupported format: {fmt}")
# Fallback: previous behaviour (no comments preserved) # Fallback: no comments preserved
if fmt == "toml": if fmt == "toml":
if not isinstance(parsed, dict): if not isinstance(parsed, dict):
raise TypeError("TOML parser result must be a dict") raise TypeError("TOML parser result must be a dict")
@ -721,4 +930,11 @@ def generate_template(
if not isinstance(parsed, (dict, list)): if not isinstance(parsed, (dict, list)):
raise TypeError("JSON parser result must be a dict or list") raise TypeError("JSON parser result must be a dict or list")
return _generate_json_template(role_prefix, parsed) return _generate_json_template(role_prefix, parsed)
if fmt == "xml":
if not isinstance(parsed, ET.Element):
raise TypeError("XML parser result must be an Element")
# We don't have original_text, so comments are already lost.
# Re-serialise and run through the same templating path.
xml_str = ET.tostring(parsed, encoding="unicode")
return _generate_xml_template_from_text(role_prefix, xml_str)
raise ValueError(f"Unsupported format: {fmt}") raise ValueError(f"Unsupported format: {fmt}")

225
tests/samples/ossec.xml Normal file
View file

@ -0,0 +1,225 @@
<!-- @(#) $Id: ./etc/rules/web_rules.xml, 2013/02/28 dcid Exp $
-
- Official Web access rules for OSSEC.
-
- Copyright (C) 2009 Trend Micro Inc.
- All rights reserved.
-
- This program is a free software; you can redistribute it
- and/or modify it under the terms of the GNU General Public
- License (version 2) as published by the FSF - Free Software
- Foundation.
-
- License details: http://www.ossec.net/en/licensing.html
-->
<group name="web,accesslog,">
<rule id="31100" level="0">
<category>web-log</category>
<description>Access log messages grouped.</description>
</rule>
<rule id="31108" level="0">
<if_sid>31100</if_sid>
<id_pcre2>^2|^3</id_pcre2>
<compiled_rule>is_simple_http_request</compiled_rule>
<description>Ignored URLs (simple queries).</description>
</rule>
<rule id="31101" level="5">
<if_sid>31100</if_sid>
<id_pcre2>^4</id_pcre2>
<description>Web server 400 error code.</description>
</rule>
<rule id="31102" level="0">
<if_sid>31101</if_sid>
<url_pcre2>\.jpg$|\.gif$|favicon\.ico$|\.png$|robots\.txt$|\.css$|\.js$|\.jpeg$</url_pcre2>
<compiled_rule>is_simple_http_request</compiled_rule>
<description>Ignored extensions on 400 error codes.</description>
</rule>
<rule id="31103" level="6">
<if_sid>31100,31108</if_sid>
<url_pcre2>=select%20|select\+|insert%20|%20from%20|%20where%20|union%20|</url_pcre2>
<url_pcre2>union\+|where\+|null,null|xp_cmdshell</url_pcre2>
<description>SQL injection attempt.</description>
<group>attack,sql_injection,</group>
</rule>
<rule id="31104" level="6">
<if_sid>31100</if_sid>
<!-- Attempt to do directory transversal, simple sql injections,
- or access to the etc or bin directory (unix). -->
<url_pcre2>%027|%00|%01|%7f|%2E%2E|%0A|%0D|\.\./\.\.|\.\.\\\.\.|echo;|</url_pcre2>
<url_pcre2>cmd\.exe|root\.exe|_mem_bin|msadc|/winnt/|/boot\.ini|</url_pcre2>
<url_pcre2>/x90/|default\.ida|/sumthin|nsiislog\.dll|chmod%|wget%|cd%20|</url_pcre2>
<url_pcre2>exec%20|\.\./\.\.//|%5C\.\./%5C|\./\./\./\./|2e%2e%5c%2e|\\x5C\\x5C</url_pcre2>
<description>Common web attack.</description>
<group>attack,</group>
</rule>
<rule id="31105" level="6">
<if_sid>31100</if_sid>
<url_pcre2>%3Cscript|%3C%2Fscript|script>|script%3E|SRC=javascript|IMG%20|</url_pcre2>
<url_pcre2>%20ONLOAD=|INPUT%20|iframe%20</url_pcre2>
<description>XSS (Cross Site Scripting) attempt.</description>
<group>attack,</group>
</rule>
<rule id="31106" level="6">
<if_sid>31103, 31104, 31105</if_sid>
<id_pcre2>^200</id_pcre2>
<description>A web attack returned code 200 (success).</description>
<group>attack,</group>
</rule>
<rule id="31110" level="6">
<if_sid>31100</if_sid>
<url_pcre2>\?-d|\?-s|\?-a|\?-b|\?-w</url_pcre2>
<description>PHP CGI-bin vulnerability attempt.</description>
<group>attack,</group>
</rule>
<rule id="31109" level="6">
<if_sid>31100</if_sid>
<url_pcre2>\+as\+varchar</url_pcre2>
<pcre2>%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)%2Bchar\(\d+\)</pcre2>
<description>MSSQL Injection attempt (/ur.php, urchin.js)</description>
<group>attack,</group>
</rule>
<!-- If your site have a search engine, you may need to ignore
- it in here.
-->
<rule id="31107" level="0">
<if_sid>31103, 31104, 31105</if_sid>
<url_pcre2>^/search\.php\?search=|^/index\.php\?searchword=</url_pcre2>
<description>Ignored URLs for the web attacks</description>
</rule>
<rule id="31115" level="13" maxsize="7900">
<if_sid>31100</if_sid>
<description>URL too long. Higher than allowed on most </description>
<description>browsers. Possible attack.</description>
<group>invalid_access,</group>
</rule>
<!-- 500 error codes, server error
- http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
-->
<rule id="31120" level="5">
<if_sid>31100</if_sid>
<id_pcre2>^50</id_pcre2>
<description>Web server 500 error code (server error).</description>
</rule>
<rule id="31121" level="4">
<if_sid>31120</if_sid>
<id_pcre2>^501</id_pcre2>
<description>Web server 501 error code (Not Implemented).</description>
</rule>
<rule id="31122" level="5">
<if_sid>31120</if_sid>
<id_pcre2>^500</id_pcre2>
<options>alert_by_email</options>
<description>Web server 500 error code (Internal Error).</description>
<group>system_error,</group>
</rule>
<rule id="31123" level="4">
<if_sid>31120</if_sid>
<id_pcre2>^503</id_pcre2>
<options>alert_by_email</options>
<description>Web server 503 error code (Service unavailable).</description>
</rule>
<!-- Rules to ignore crawlers -->
<rule id="31140" level="0">
<if_sid>31101</if_sid>
<compiled_rule>is_valid_crawler</compiled_rule>
<description>Ignoring google/msn/yahoo bots.</description>
</rule>
<!-- Ignoring nginx 499's -->
<rule id="31141" level="0">
<if_sid>31101</if_sid>
<id_pcre2>^499</id_pcre2>
<description>Ignored 499's on nginx.</description>
</rule>
<rule id="31151" level="10" frequency="12" timeframe="90">
<if_matched_sid>31101</if_matched_sid>
<same_source_ip />
<description>Multiple web server 400 error codes </description>
<description>from same source ip.</description>
<group>web_scan,recon,</group>
</rule>
<rule id="31152" level="10" frequency="6" timeframe="120">
<if_matched_sid>31103</if_matched_sid>
<same_source_ip />
<description>Multiple SQL injection attempts from same </description>
<description>source ip.</description>
<group>attack,sql_injection,</group>
</rule>
<rule id="31153" level="10" frequency="8" timeframe="120">
<if_matched_sid>31104</if_matched_sid>
<same_source_ip />
<description>Multiple common web attacks from same source ip.</description>
<group>attack,</group>
</rule>
<rule id="31154" level="10" frequency="8" timeframe="120">
<if_matched_sid>31105</if_matched_sid>
<same_source_ip />
<description>Multiple XSS (Cross Site Scripting) attempts </description>
<description>from same source ip.</description>
<group>attack,</group>
</rule>
<rule id="31161" level="10" frequency="12" timeframe="120">
<if_matched_sid>31121</if_matched_sid>
<same_source_ip />
<description>Multiple web server 501 error code (Not Implemented).</description>
<group>web_scan,recon,</group>
</rule>
<rule id="31162" level="10" frequency="12" timeframe="120">
<if_matched_sid>31122</if_matched_sid>
<same_source_ip />
<description>Multiple web server 500 error code (Internal Error).</description>
<group>system_error,</group>
</rule>
<rule id="31163" level="10" frequency="12" timeframe="120">
<if_matched_sid>31123</if_matched_sid>
<same_source_ip />
<description>Multiple web server 503 error code (Service unavailable).</description>
<group>web_scan,recon,</group>
</rule>
<rule id="31164" level="6">
<if_sid>31100</if_sid>
<url_pcre2>=%27|select%2B|insert%2B|%2Bfrom%2B|%2Bwhere%2B|%2Bunion%2B</url_pcre2>
<description>SQL injection attempt.</description>
<group>attack,sqlinjection,</group>
</rule>
<rule id="31165" level="6">
<if_sid>31100</if_sid>
<url_pcre2>%EF%BC%87|%EF%BC%87|%EF%BC%87|%2531|%u0053%u0045</url_pcre2>
<description>SQL injection attempt.</description>
<group>attack,sqlinjection,</group>
</rule>
</group> <!-- Web access log -->

View file

@ -5,6 +5,7 @@ import configparser
import pytest import pytest
import textwrap import textwrap
import yaml import yaml
import xml.etree.ElementTree as ET
import jinjaturtle.core as core import jinjaturtle.core as core
from jinjaturtle.core import ( from jinjaturtle.core import (
@ -147,12 +148,15 @@ def test_formats_match_expected_extensions():
""" """
toml_path = SAMPLES_DIR / "tom.toml" toml_path = SAMPLES_DIR / "tom.toml"
ini_path = SAMPLES_DIR / "php.ini" ini_path = SAMPLES_DIR / "php.ini"
xml_path = SAMPLES_DIR / "ossec.xml"
fmt_toml, _ = parse_config(toml_path) fmt_toml, _ = parse_config(toml_path)
fmt_ini, _ = parse_config(ini_path) fmt_ini, _ = parse_config(ini_path)
fmt_xml, _ = parse_config(xml_path)
assert fmt_toml == "toml" assert fmt_toml == "toml"
assert fmt_ini == "ini" assert fmt_ini == "ini"
assert fmt_xml == "xml"
def test_parse_config_toml_missing_tomllib(monkeypatch): def test_parse_config_toml_missing_tomllib(monkeypatch):
@ -442,3 +446,210 @@ def test_fallback_str_representer_for_unknown_type():
# It should serialize without error, and the string form should appear. # It should serialize without error, and the string form should appear.
assert "weird-value" in dumped assert "weird-value" in dumped
def test_xml_roundtrip_ossec_web_rules():
    """
    End-to-end run over the OSSEC sample: parse, flatten, build defaults,
    then generate a template that references every default.
    """
    xml_path = SAMPLES_DIR / "ossec.xml"
    assert xml_path.is_file(), f"Missing sample XML file: {xml_path}"

    fmt, parsed = parse_config(xml_path)
    assert fmt == "xml"

    flat_items = flatten_config(fmt, parsed)
    assert flat_items, "Expected at least one flattened item from XML sample"

    defaults = yaml.safe_load(generate_defaults_yaml("ossec", flat_items))

    # The defaults document must be a populated mapping.
    assert isinstance(defaults, dict)
    assert defaults, "Expected non-empty defaults for XML sample"

    # Every variable name is prefixed, lowercase and free of spaces.
    for name in defaults:
        assert name.startswith("ossec_")
        assert name == name.lower()
        assert " " not in name

    # The root <group name="web,accesslog,"> attribute flattens to ossec_name.
    assert defaults["ossec_name"] == "web,accesslog,"

    # Rule id="31100" must show up as a default value somewhere...
    id_keys = [name for name, value in defaults.items() if value == "31100"]
    assert id_keys, "Expected to find a default for rule id 31100"

    # ...and at least one of those must be a rule's *id* attribute.
    assert any(
        name.startswith("ossec_rule_") and name.endswith("_id") for name in id_keys
    ), f"Expected at least one *_id var for value 31100, got: {id_keys}"

    # Template generation from the original text keeps comments.
    original_text = xml_path.read_text(encoding="utf-8")
    template = generate_template(fmt, parsed, "ossec", original_text=original_text)
    assert isinstance(template, str)
    assert template.strip(), "Template for XML sample should not be empty"

    # Both the file-header comment and a mid-file comment must survive.
    assert "Official Web access rules for OSSEC." in template
    assert "Rules to ignore crawlers" in template

    # Every default variable must be referenced by the template.
    for var_name in defaults:
        assert (
            var_name in template
        ), f"Variable {var_name} not referenced in XML template"
def test_generate_xml_template_from_text_edge_cases():
    """
    Exercise XML text edge cases:
    - XML declaration and DOCTYPE in prolog
    - top-level and inner comments
    - repeated child elements (indexing)
    - attributes and text content
    """
    text = textwrap.dedent(
        """\
        <?xml version="1.0"?>
        <!-- top comment -->
        <!DOCTYPE something>
        <root attr="1">
        <!-- inner comment -->
        <child attr="2">text</child>
        <child>other</child>
        </root>
        """
    )
    tmpl = core._generate_xml_template_from_text("role", text)

    # Prolog (declaration, comment, DOCTYPE) and inner comments preserved
    assert "<?xml version" in tmpl
    assert "top comment" in tmpl
    assert "<!DOCTYPE something>" in tmpl
    assert "inner comment" in tmpl

    # Root attribute becomes a variable (path ("@attr",) -> role_attr)
    assert "{{ role_attr }}" in tmpl

    # Repeated <child> elements are indexed. The first child has an
    # attribute, so its text lives under the "value" suffix; checking the
    # bare "role_child_0" would be satisfied by "role_child_0_attr" alone.
    assert "{{ role_child_0_attr }}" in tmpl
    assert "{{ role_child_0_value }}" in tmpl

    # The second child is a plain leaf, so its text uses the indexed path as-is.
    assert "{{ role_child_1 }}" in tmpl
def test_generate_template_xml_type_error():
    """
    generate_template must raise TypeError when the XML 'parsed' argument
    is not an Element.
    """
    bogus_parsed = "not an element"
    with pytest.raises(TypeError):
        generate_template("xml", parsed=bogus_parsed, role_prefix="role")
def test_flatten_config_xml_type_error():
    """
    flatten_config must raise TypeError when the XML 'parsed' argument
    is not an Element.
    """
    bogus_parsed = "not-an-element"
    with pytest.raises(TypeError):
        flatten_config("xml", parsed=bogus_parsed)
def test_generate_template_xml_structural_fallback():
    """
    Without original_text, generate_template for XML takes the structural
    fallback (ET.tostring + _generate_xml_template_from_text).
    """
    source = textwrap.dedent(
        """\
        <root attr="1">
        <child>2</child>
        <node attr="x">text</node>
        </root>
        """
    )
    builder = ET.TreeBuilder(insert_comments=False)
    element = ET.fromstring(source, parser=ET.XMLParser(target=builder))

    rendered = generate_template("xml", parsed=element, role_prefix="role")

    # Expected variables, in order:
    # - root attribute            ("@attr",)          -> role_attr
    # - plain child text          ("child",)          -> role_child
    # - attribute on <node>       ("node", "@attr")   -> role_node_attr
    # - text on <node> with attrs ("node", "value")   -> role_node_value
    for expected in ("role_attr", "role_child", "role_node_attr", "role_node_value"):
        assert expected in rendered
def test_split_xml_prolog_only_whitespace():
    """
    Input made only of whitespace ends up entirely in the prolog with an
    empty body (covers the 'if i >= n: break' path).
    """
    blank = " \n\t"
    assert core._split_xml_prolog(blank) == (blank, "")
def test_split_xml_prolog_unterminated_declaration():
    """
    An XML declaration with no closing '?>' takes the 'end == -1' branch,
    leaving the whole string as body.
    """
    broken = "<?xml version='1.0'"
    assert core._split_xml_prolog(broken) == ("", broken)
def test_split_xml_prolog_unterminated_comment():
    """
    A comment with no closing '-->' takes the comment 'end == -1' branch,
    leaving the whole string as body.
    """
    broken = "<!-- no end"
    assert core._split_xml_prolog(broken) == ("", broken)
def test_split_xml_prolog_unterminated_doctype():
    """
    A DOCTYPE with no closing '>' takes the DOCTYPE 'end == -1' branch,
    leaving the whole string as body.
    """
    broken = "<!DOCTYPE foo"
    assert core._split_xml_prolog(broken) == ("", broken)
def test_split_xml_prolog_unexpected_content():
    """
    Leading non-markup text hits the 'unexpected content' break, so the
    prolog is empty and the input comes back whole as body.
    """
    junk = "garbage<root/>"
    assert core._split_xml_prolog(junk) == ("", junk)
def test_flatten_xml_text_with_attributes_uses_value_suffix():
    """
    Text on an element that also has attributes is flattened under
    path + ('value',) rather than the bare element path.
    """
    builder = ET.TreeBuilder(insert_comments=False)
    element = ET.fromstring(
        "<root><node attr='x'>text</node></root>",
        parser=ET.XMLParser(target=builder),
    )

    flattened = flatten_config("xml", element)

    # The attribute sits at ("node", "@attr").
    assert (("node", "@attr"), "x") in flattened
    # The text moves to ("node", "value") because <node> has an attribute.
    assert (("node", "value"), "text") in flattened