Add export options

2025-11-02 12:49:19 +11:00 · 2025-11-02 12:49:19 +11:00 · fb4a9e5e27
commit fb4a9e5e27
parent 6cae652643
4 changed files with 171 additions and 13 deletions
--- a/bouquin/db.py
+++ b/bouquin/db.py
@ -1,9 +1,16 @@
 from __future__ import annotations

+import csv
+import html
+import json
+import os
+
 from dataclasses import dataclass
 from pathlib import Path
-
 from sqlcipher3 import dbapi2 as sqlite
+from typing import List, Sequence, Tuple
+
+Entry = Tuple[str, str]


@dataclass
@ -21,6 +28,7 @@ class DBManager:
        # Ensure parent dir exists
        self.cfg.path.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite.connect(str(self.cfg.path))
+        self.conn.row_factory = sqlite.Row
        cur = self.conn.cursor()
        cur.execute(f"PRAGMA key = '{self.cfg.key}';")
        cur.execute("PRAGMA journal_mode = WAL;")
@ -102,14 +110,116 @@ class DBManager:
    def search_entries(self, text: str) -> list[str]:
        cur = self.conn.cursor()
        pattern = f"%{text}%"
-        cur.execute("SELECT * FROM entries WHERE TRIM(content) LIKE ?", (pattern,))
-        return [r for r in cur.fetchall()]
+        return cur.execute(
+            "SELECT * FROM entries WHERE TRIM(content) LIKE ?", (pattern,)
+        ).fetchall()

    def dates_with_content(self) -> list[str]:
        cur = self.conn.cursor()
        cur.execute("SELECT date FROM entries WHERE TRIM(content) <> '';")
        return [r[0] for r in cur.fetchall()]

+    def get_all_entries(self) -> List[Entry]:
+        """Return every stored entry as ``(date, content)``, ordered by date."""
+        cursor = self.conn.cursor()
+        cursor.execute("SELECT date, content FROM entries ORDER BY date")
+        collected: List[Entry] = []
+        # Columns are read by name, so the connection must yield mapping-style rows.
+        for record in cursor.fetchall():
+            collected.append((record["date"], record["content"]))
+        return collected
+
+    def export_json(
+        self, entries: Sequence[Entry], file_path: str, pretty: bool = True
+    ) -> None:
+        """Write entries to ``file_path`` as a JSON array of date/content objects.
+
+        With ``pretty`` the output is indented by two spaces; otherwise it is
+        written in the most compact form. Non-ASCII text is kept as-is.
+        """
+        records = [{"date": day, "content": text} for day, text in entries]
+        # Only the layout differs between the two modes, so choose it up front.
+        layout = {"indent": 2} if pretty else {"separators": (",", ":")}
+        with open(file_path, "w", encoding="utf-8") as out:
+            json.dump(records, out, ensure_ascii=False, **layout)
+
+    def export_csv(self, entries: Sequence[Entry], file_path: str) -> None:
+        """Write entries to ``file_path`` as CSV with a ``date,content`` header."""
+        # The BOM emitted by utf-8-sig makes Excel detect the file as UTF-8.
+        with open(file_path, "w", encoding="utf-8-sig", newline="") as handle:
+            out = csv.writer(handle)
+            # Header row first, then one row per entry.
+            out.writerows([["date", "content"], *entries])
+
+    def export_txt(
+        self,
+        entries: Sequence[Entry],
+        file_path: str,
+        separator: str = "\n\n— — — — —\n\n",
+        strip_html: bool = True,
+    ) -> None:
+        """Write entries to a UTF-8 plain-text file.
+
+        Each entry is written as its date on one line followed by its body.
+        ``separator`` is placed between consecutive entries, never after the
+        last one.
+
+        Args:
+            entries: ``(date, content)`` pairs to export.
+            file_path: Destination path; an existing file is overwritten.
+            separator: Text written between two entries.
+            strip_html: When true, markup is removed from each body, with
+                ``<br>`` and closing block tags turned into line breaks.
+        """
+        import html as _html
+        import re
+
+        # These are raw strings, so regex escapes take a single backslash;
+        # a doubled backslash would match a literal backslash character and
+        # the line-break / whitespace rules would never fire.
+        style_script_re = re.compile(r"(?is)<(script|style)[^>]*>.*?</\1>")
+        comment_re = re.compile(r"<!--.*?-->", re.S)
+        br_re = re.compile(r"(?i)<br\s*/?>")
+        block_end_re = re.compile(r"(?i)</(p|div|section|article|li|h[1-6])\s*>")
+        tag_re = re.compile(r"<[^>]+>")
+        ws_ends_re = re.compile(r"[ \t]+\n")
+        multinewline_re = re.compile(r"\n{3,}")
+
+        def _strip(s: str) -> str:
+            # 1) Remove <style> and <script> blocks *including their contents*
+            s = style_script_re.sub("", s)
+            # 2) Remove HTML comments
+            s = comment_re.sub("", s)
+            # 3) Turn block-ish boundaries into newlines before removing tags
+            s = br_re.sub("\n", s)
+            s = block_end_re.sub("\n", s)
+            # 4) Drop remaining tags
+            s = tag_re.sub("", s)
+            # 5) Unescape entities (&nbsp; etc.)
+            s = _html.unescape(s)
+            # 6) Tidy whitespace: trailing blanks, then runs of blank lines
+            s = ws_ends_re.sub("\n", s)
+            s = multinewline_re.sub("\n\n", s)
+            return s.strip()
+
+        chunks = [
+            f"{d}\n{_strip(c) if strip_html else c}\n" for d, c in entries
+        ]
+        with open(file_path, "w", encoding="utf-8") as f:
+            # join() puts the separator between entries only.
+            f.write(separator.join(chunks))
+
+    def export_html(
+        self, entries: Sequence[Entry], file_path: str, title: str = "Entries export"
+    ) -> None:
+        """Write entries to ``file_path`` as a single standalone HTML page.
+
+        The title and each date are HTML-escaped. Entry content is inserted
+        verbatim, so any markup it contains is rendered by the browser.
+        """
+        safe_title = html.escape(title)
+        preamble = [
+            "<!doctype html>",
+            '<html lang="en">',
+            '<meta charset="utf-8">',
+            f"<title>{safe_title}</title>",
+            "<style>body{font:16px/1.5 system-ui,Segoe UI,Roboto,Helvetica,Arial,sans-serif;padding:24px;max-width:900px;margin:auto;}",
+            "article{padding:16px 0;border-bottom:1px solid #ddd;} time{font-weight:600;color:#333;} section{margin-top:8px;}</style>",
+            "<body>",
+            f"<h1>{safe_title}</h1>",
+        ]
+        # One <article> per entry, in the order given.
+        articles = [
+            f"<article><header><time>{html.escape(day)}</time></header><section>{body}</section></article>"
+            for day, body in entries
+        ]
+        document = "\n".join([*preamble, *articles, "</body></html>"])
+
+        with open(file_path, "w", encoding="utf-8") as out:
+            out.write(document)
+
+    def export_by_extension(self, file_path: str) -> None:
+        """Export all entries to ``file_path`` in the format its extension names.
+
+        Supported extensions (case-insensitive): ``.json``, ``.csv``, ``.txt``,
+        ``.html`` and ``.htm``.
+
+        Raises:
+            ValueError: If the extension is not one of the supported ones.
+        """
+        # Entries are loaded before the extension is checked.
+        entries = self.get_all_entries()
+        suffix = os.path.splitext(file_path)[1].lower()
+
+        exporters = {
+            ".json": self.export_json,
+            ".csv": self.export_csv,
+            ".txt": self.export_txt,
+            ".html": self.export_html,
+            ".htm": self.export_html,
+        }
+        exporter = exporters.get(suffix)
+        if exporter is None:
+            raise ValueError(f"Unsupported extension: {suffix}")
+        exporter(entries, file_path)
+
    def close(self) -> None:
        if self.conn is not None:
            self.conn.close()