Add support for XML

This commit is contained in:
Miguel Jacq 2025-11-27 14:26:48 +11:00
parent 022990a337
commit 24f7dbea02
Signed by: mig5
GPG key ID: 59B3F0C24135C6A9
5 changed files with 662 additions and 6 deletions

View file

@ -30,7 +30,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
ap.add_argument(
"-f",
"--format",
choices=["ini", "json", "toml", "yaml"],
choices=["ini", "json", "toml", "yaml", "xml"],
help="Force config format instead of auto-detecting from filename.",
)
ap.add_argument(

View file

@ -2,9 +2,12 @@ from __future__ import annotations
import configparser
import json
import xml.etree.ElementTree as ET
import yaml
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any, Iterable
import yaml
try:
import tomllib # Python 3.11+
@ -46,7 +49,7 @@ _TurtleDumper.add_representer(None, _fallback_str_representer)
def detect_format(path: Path, explicit: str | None = None) -> str:
"""
Determine config format (toml, yaml, json, ini-ish) from argument or filename.
Determine config format (toml, yaml, json, ini-ish, xml) from argument or filename.
"""
if explicit:
return explicit
@ -60,6 +63,8 @@ def detect_format(path: Path, explicit: str | None = None) -> str:
return "json"
if suffix in {".ini", ".cfg", ".conf"} or name.endswith(".ini"):
return "ini"
if suffix == ".xml":
return "xml"
# Fallback: treat as INI-ish
return "ini"
@ -96,9 +101,76 @@ def parse_config(path: Path, fmt: str | None = None) -> tuple[str, Any]:
parser.read_file(f)
return fmt, parser
if fmt == "xml":
# Parse XML into an ElementTree Element.
# We do NOT insert comments here so flattening stays simple.
text = path.read_text(encoding="utf-8")
parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=False))
root = ET.fromstring(text, parser=parser)
return fmt, root
raise ValueError(f"Unsupported config format: {fmt}")
def _flatten_xml(root: ET.Element) -> list[tuple[tuple[str, ...], Any]]:
    """
    Turn an XML element tree into an ordered list of (path, value) pairs.

    The root element acts purely as a container: its tag never appears in
    any path, so its children sit at the top level.

    How paths are built:
      - Attribute ``name`` on an element at path P -> P + ("@name",)
            <server host="localhost"> -> ("server", "@host") = "localhost"
      - Non-blank element text (stripped of surrounding whitespace):
            <foo>bar</foo>            -> ("foo",) = "bar"
            <foo attr="x">bar</foo>   -> ("foo", "value") = "bar"
        i.e. the extra "value" component is used whenever the element
        also carries attributes or child elements.
      - A tag that occurs more than once among siblings gets a zero-based
        index component after the tag; a tag that occurs once does not:
            <endpoint>/a</endpoint>
            <endpoint>/b</endpoint>   -> ("endpoint", "0") = "/a"
                                         ("endpoint", "1") = "/b"

    For every element the pairs are produced in this order: attributes,
    then text, then each child subtree in document order. Tail text
    (text following a closing tag) is never read.
    """

    def emit(node: ET.Element, prefix: tuple[str, ...]):
        # 1. One pair per attribute, keyed with a leading "@".
        for name, value in node.attrib.items():
            yield prefix + (f"@{name}",), value

        # Only real elements count as children; nodes whose tag is not a
        # string (e.g. comments that made it into the tree) are skipped.
        element_children = [kid for kid in node if isinstance(kid.tag, str)]

        # 2. The element's own text, if anything is left after stripping.
        stripped = (node.text or "").strip()
        if stripped:
            is_plain_leaf = not (node.attrib or element_children)
            text_path = prefix if is_plain_leaf else prefix + ("value",)
            yield text_path, stripped

        # 3. Recurse, numbering only those tags that repeat among siblings.
        tag_totals = Counter(kid.tag for kid in element_children)
        next_index: dict[str, int] = defaultdict(int)
        for kid in element_children:
            if tag_totals[kid.tag] > 1:
                position = next_index[kid.tag]
                next_index[kid.tag] = position + 1
                kid_prefix = prefix + (kid.tag, str(position))
            else:
                kid_prefix = prefix + (kid.tag,)
            yield from emit(kid, kid_prefix)

    # Start with an empty path so the root tag itself is not recorded.
    return list(emit(root, ()))
def flatten_config(fmt: str, parsed: Any) -> list[tuple[tuple[str, ...], Any]]:
"""
Flatten parsed config into a list of (path_tuple, value).
@ -141,6 +213,12 @@ def flatten_config(fmt: str, parsed: Any) -> list[tuple[tuple[str, ...], Any]]:
else:
processed = raw
items.append(((section, key), processed))
elif fmt == "xml":
if not isinstance(parsed, ET.Element):
raise TypeError("XML parser result must be an Element")
items = _flatten_xml(parsed)
else: # pragma: no cover
raise ValueError(f"Unsupported format: {fmt}")
@ -677,6 +755,135 @@ def _generate_json_template(role_prefix: str, data: Any) -> str:
return json.dumps(templated, indent=2, ensure_ascii=False) + "\n"
def _find_doctype_end(text: str, start: int) -> int:
    """
    Return the index of the ``>`` that closes a DOCTYPE declaration.

    Scanning begins at ``start`` (just past the ``<!DOCTYPE`` keyword).
    A ``>`` only counts as the terminator when it is outside any quoted
    literal, outside any ``<!-- -->`` comment and outside the ``[ ... ]``
    internal subset. Returns -1 when no terminator is found.
    """
    depth = 0  # nesting level of the [ ... ] internal subset
    quote = ""  # the quote character currently open, if any
    pos = start
    length = len(text)
    while pos < length:
        ch = text[pos]
        if quote:
            # Inside a quoted literal everything is opaque until it closes.
            if ch == quote:
                quote = ""
        elif text.startswith("<!--", pos):
            # Comments in the internal subset may contain quotes/brackets.
            comment_end = text.find("-->", pos + 4)
            if comment_end == -1:
                return -1
            pos = comment_end + 3
            continue
        elif ch in "\"'":
            quote = ch
        elif ch == "[":
            depth += 1
        elif ch == "]":
            if depth:
                depth -= 1
        elif ch == ">" and depth == 0:
            return pos
        pos += 1
    return -1


def _split_xml_prolog(text: str) -> tuple[str, str]:
    """
    Split an XML document into ``(prolog, body)``.

    The prolog is the leading run of:
      - whitespace
      - processing instructions, including the XML declaration (<?...?>)
      - top-level comments (<!-- ... -->)
      - a DOCTYPE declaration, including one with an internal subset
        (``<!DOCTYPE a [ ... ]>``) or with ``>`` inside a quoted literal

    The body is everything from the first character that is not part of
    such a construct (normally the root element's opening ``<``).
    ``prolog + body`` always equals ``text``.

    An unterminated construct is not consumed: splitting stops in front of
    it and it becomes the start of the body.
    """
    pos = 0
    length = len(text)

    while pos < length:
        # Leading whitespace between prolog items belongs to the prolog.
        while pos < length and text[pos].isspace():
            pos += 1
        if pos >= length:
            break

        if text.startswith("<?", pos):
            end = text.find("?>", pos + 2)
            closer_len = 2
        elif text.startswith("<!--", pos):
            end = text.find("-->", pos + 4)
            closer_len = 3
        elif text.startswith("<!DOCTYPE", pos):
            # A plain find(">") would stop inside an internal subset or a
            # quoted literal, so use the structure-aware scanner instead.
            end = _find_doctype_end(text, pos + 9)
            closer_len = 1
        else:
            # Root element (or unexpected content): the prolog ends here.
            break

        if end == -1:
            # Unterminated construct: leave it for the body.
            break
        pos = end + closer_len

    # Everything consumed so far is contiguous, so one slice is enough.
    return text[:pos], text[pos:]
def _apply_jinja_to_xml_tree(role_prefix: str, root: ET.Element) -> None:
    """
    Rewrite the tree in place so every scalar becomes a Jinja expression.

    Each attribute value, and each element text that is non-blank after
    stripping, is overwritten with ``{{ <var> }}`` where ``<var>`` is
    ``make_var_name(role_prefix, path)``. Paths follow the same scheme as
    _flatten_xml: "@name" for attributes, a trailing "value" for text on
    elements that also have attributes or child elements, and a zero-based
    index after any tag that repeats among its siblings. The root tag is
    not part of any path.

    Nothing is returned; ``root`` and its descendants are mutated.
    """
    # Explicit stack instead of recursion; children are pushed in reverse
    # so elements are still visited in document (pre-order) order.
    pending: list[tuple[ET.Element, tuple[str, ...]]] = [(root, ())]

    while pending:
        node, prefix = pending.pop()

        # Snapshot the attribute names because values are replaced below.
        for attr in list(node.attrib):
            var_name = make_var_name(role_prefix, prefix + (f"@{attr}",))
            node.set(attr, f"{{{{ {var_name} }}}}")

        # Nodes whose tag is not a string (e.g. comments) are not children.
        kids = [kid for kid in node if isinstance(kid.tag, str)]

        if (node.text or "").strip():
            bare_leaf = not (node.attrib or kids)
            text_path = prefix if bare_leaf else prefix + ("value",)
            var_name = make_var_name(role_prefix, text_path)
            node.text = f"{{{{ {var_name} }}}}"

        # Work out each child's path, indexing only repeated tags.
        totals = Counter(kid.tag for kid in kids)
        seen: dict[str, int] = defaultdict(int)
        scheduled: list[tuple[ET.Element, tuple[str, ...]]] = []
        for kid in kids:
            if totals[kid.tag] > 1:
                ordinal = seen[kid.tag]
                seen[kid.tag] = ordinal + 1
                scheduled.append((kid, prefix + (kid.tag, str(ordinal))))
            else:
                scheduled.append((kid, prefix + (kid.tag,)))

        pending.extend(reversed(scheduled))
def _generate_xml_template_from_text(role_prefix: str, text: str) -> str:
    """
    Build a Jinja2 template from the raw text of an XML file.

    The prolog returned by _split_xml_prolog is copied through verbatim
    and comments inside the document are kept. Within the root element,
    _apply_jinja_to_xml_tree replaces values with placeholders, e.g.
    (exact variable names are decided by make_var_name):

        <server host="localhost" />  ->  <server host="{{ prefix_server_host }}" />
        <port>8080</port>            ->  <port>{{ prefix_port }}</port>

    Text on an element that also has attributes or children uses a path
    with a trailing "value" component, matching the flattening scheme.

    Returns the prolog followed by the re-serialised, templated root.
    """
    prolog, body = _split_xml_prolog(text)

    # Build the tree with comment nodes included so <!-- --> survive the
    # parse/serialise round trip.
    comment_keeping_builder = ET.TreeBuilder(insert_comments=True)
    root = ET.fromstring(body, parser=ET.XMLParser(target=comment_keeping_builder))

    _apply_jinja_to_xml_tree(role_prefix, root)

    # ET.indent only exists on Python 3.9+; skip pretty-printing otherwise.
    if hasattr(ET, "indent"):
        ET.indent(root, space=" ")

    return prolog + ET.tostring(root, encoding="unicode")
def generate_template(
fmt: str,
parsed: Any,
@ -698,11 +905,13 @@ def generate_template(
return _generate_ini_template_from_text(role_prefix, original_text)
if fmt == "yaml":
return _generate_yaml_template_from_text(role_prefix, original_text)
if fmt == "xml":
return _generate_xml_template_from_text(role_prefix, original_text)
# For JSON we ignore original_text and reconstruct from parsed structure below
if fmt != "json":
raise ValueError(f"Unsupported format: {fmt}")
# Fallback: previous behaviour (no comments preserved)
# Fallback: no comments preserved
if fmt == "toml":
if not isinstance(parsed, dict):
raise TypeError("TOML parser result must be a dict")
@ -721,4 +930,11 @@ def generate_template(
if not isinstance(parsed, (dict, list)):
raise TypeError("JSON parser result must be a dict or list")
return _generate_json_template(role_prefix, parsed)
if fmt == "xml":
if not isinstance(parsed, ET.Element):
raise TypeError("XML parser result must be an Element")
# We don't have original_text, so comments are already lost.
# Re-serialise and run through the same templating path.
xml_str = ET.tostring(parsed, encoding="unicode")
return _generate_xml_template_from_text(role_prefix, xml_str)
raise ValueError(f"Unsupported format: {fmt}")