git.mar77i.info Git - mar77i.info/commitdiff
add sitemap master
author mar77i <mar77i@protonmail.ch>
Mon, 24 Mar 2025 21:39:01 +0000 (22:39 +0100)
committer mar77i <mar77i@protonmail.ch>
Mon, 24 Mar 2025 21:39:01 +0000 (22:39 +0100)
content/blog/index.md [new file with mode: 0644]
content/nav.md
generate.py
md.py
post-receive.sh
requirements.txt [new file with mode: 0644]
template.html
utils.py [deleted file]

diff --git a/content/blog/index.md b/content/blog/index.md
new file mode 100644 (file)
index 0000000..bc17520
--- /dev/null
+++ b/content/blog/index.md
@@ -0,0 +1,11 @@
+# Blog
+
+Blog overview, by [date](#by_date), or by [hashtags](#by_hashtags)
+
+## {id=by_date}Index
+
+blog-index
+
+## {id=by_hashtags}Hashtags
+
+hashtags
diff --git a/content/nav.md b/content/nav.md
index 6a7508641b2a14950032edac26789855c0e7139f..3e48e901a21009fadc576f97ea88c6cc36834ae5 100644 (file)
--- a/content/nav.md
+++ b/content/nav.md
@@ -1,3 +1,3 @@
 - [Home](/)
-- [Blog](/blog)
+- [Blog](/blog/)
 - [Git](https://git.mar77i.info/)
diff --git a/generate.py b/generate.py
index 53301ab6857ba475733838d4f066bc6a58d5909d..4f48f5f438f80db0759b867bf3f4d115cd8ca93b 100755 (executable)
--- a/generate.py
+++ b/generate.py
 #!/usr/bin/env python3
 
+import os
 import shutil
+import sys
+import xmlschema
 from argparse import ArgumentParser
+from contextlib import contextmanager
+from datetime import datetime, UTC
+from functools import cached_property
+from hashlib import sha256
+from itertools import chain
 from pathlib import Path
+from shutil import rmtree
+from xml.etree import ElementTree
 
 from md import MDRenderer
-from utils import get_content, write_content
+
+
+def path_len_parts(p):
+    return len(p.parts)
+
+
+@contextmanager
+def cleanup_existing_output(output_path):
+    existing_output = []
+    if not output_path.exists():
+        output_path.mkdir(0o755, True, True)
+    else:
+        for current, *dirs_and_files in os.walk(output_path):
+            current_path = Path(current)
+            existing_output.extend(
+                current_path / f for f in chain.from_iterable(dirs_and_files)
+            )
+    yield existing_output
+    for path in sorted(existing_output, key=path_len_parts, reverse=True):
+        is_dir = path.is_dir()
+        print(f"deleting {str(path)}{'/' if is_dir else ''}")
+        if is_dir:
+            path.rmdir()
+        else:
+            path.unlink()
+
+
+class WebsiteGenerator:
+    STATIC_FILES = [
+        "style.css",
+    ]
+    SITEMAP_NAMESPACE = "http://www.sitemaps.org/schemas/sitemap/0.9"
+    SITEMAP_SCHEMA_URL = "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
+
+    def __init__(self, base_url, build_path, output_path):
+        self.base_url = base_url
+        if build_path.exists():
+            shutil.rmtree(build_path)
+        build_path.mkdir(0o755, True)
+        self.build_path = build_path
+        self.output_path = output_path
+        self.source_path = Path(__file__).parent
+        with (self.source_path / "template.html").open("rt") as fh:
+            self.template = fh.read()
+        self.files_and_urls = []
+
+    @cached_property
+    def content_path(self):
+        return self.source_path / "content"
+
+    AutoContent = type("AutoContentType", (), {})()
+
+    def add_path(self, in_path, out_path, content=None, url=None):
+        if out_path in (x[0] for x in self.files_and_urls):
+            raise ValueError("Cannot add the same file multiple times")
+        self.files_and_urls.append((out_path, url))
+        if in_path.is_dir():
+            print(f"creating {str(out_path.relative_to(self.build_path))}/")
+            out_path.mkdir(0o755)
+        elif content is not None:
+            if content is self.AutoContent:
+                with in_path.open("rb") as fh:
+                    content = fh.read()
+            print(f"writing {str(out_path.relative_to(self.build_path))}")
+            with out_path.open("wb" if isinstance(content, bytes) else "wt") as fh:
+                fh.write(content)
+        else:
+            raise ValueError("No content provided.")
+
+    def render_page(self, nav, in_path, url):
+        with in_path.open("rt") as fh:
+            renderer = MDRenderer(fh.read(), url)
+        return self.template.format(
+            nav=nav,
+            page=renderer.render_html(),
+            pagemeta=renderer.render_html_pagemeta(),
+        ), url
+
+    def get_url(self, out_path):
+        url = f"{self.base_url}{out_path.relative_to(self.build_path)}"
+        if url.endswith("/index.html"):
+            url = url[:-len("index.html")]
+        return url
+
+    def build(self):
+        print("==> building", self.base_url)
+        with (self.content_path / "nav.md").open("rt") as fh:
+            nav = MDRenderer(fh.read()).render_html()
+        index_md = self.content_path / "index.md"
+        index_html = self.build_path / "index.html"
+        self.add_path(
+            index_md,
+            index_html,
+            *self.render_page(nav, index_md, self.get_url(index_html)),
+        )
+        blog_md = self.content_path / "blog" / "index.md"
+        blog_html = self.build_path / "blog" / "index.html"
+        self.add_path(blog_md.parent, blog_html.parent)
+        self.add_path(
+            blog_md,
+            blog_html,
+            *self.render_page(nav, blog_md, self.get_url(blog_html)),
+        )
+        for static_file in self.STATIC_FILES:
+            self.add_path(
+                self.source_path / static_file,
+                self.build_path / static_file,
+                self.AutoContent,
+            )
+
+    def sync(self):
+        print("==> syncing", self.base_url)
+        with cleanup_existing_output(self.output_path) as existing_output:
+            sitemap = []
+            for src_path, url in self.files_and_urls:
+                rel_path = src_path.relative_to(self.build_path)
+                dest_path = self.output_path / rel_path
+                if dest_path in existing_output:
+                    existing_output.remove(dest_path)
+                if src_path.is_dir():
+                    if not dest_path.exists():
+                        print("creating", str(rel_path))
+                        dest_path.mkdir(0o755)
+                    continue
+                update = None if dest_path.exists() else "creating"
+                with src_path.open("rb") as fh:
+                    src_content = fh.read()
+                if update is None:
+                    with dest_path.open("rb") as fh:
+                        if sha256(src_content).digest() != sha256(fh.read()).digest():
+                            update = "updating"
+                if update is not None:
+                    print(update, rel_path)
+                    with dest_path.open("wb") as out_fh:
+                        out_fh.write(src_content)
+                if url is None:
+                    continue
+                sitemap.append(
+                    {
+                        "loc": url,
+                        "lastmod": datetime.fromtimestamp(
+                            dest_path.stat().st_mtime,
+                            UTC,
+                        ).isoformat(timespec="seconds"),
+                    }
+                )
+            self.generate_sitemap(
+                self.output_path / "sitemap.xml", sitemap, existing_output
+            )
+
+    @classmethod
+    def generate_sitemap(cls, sitemap_xml, urls, existing_output):
+        if sitemap_xml in existing_output:
+            existing_output.remove(sitemap_xml)
+        schema = xmlschema.XMLSchema(cls.SITEMAP_SCHEMA_URL)
+        with open(sitemap_xml, "wb") as fh:
+            ElementTree.register_namespace("", cls.SITEMAP_NAMESPACE)
+            fh.write(b"<?xml version='1.0' encoding='UTF-8'?>\n")
+            fh.write(
+                ElementTree.tostring(
+                    schema.encode(
+                        {
+                            "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
+                            "@xmlns": cls.SITEMAP_NAMESPACE,
+                            "@xsi:schemaLocation": (
+                                f"{cls.SITEMAP_NAMESPACE} {cls.SITEMAP_SCHEMA_URL}"
+                            ),
+                            "url": urls,
+                        }
+                    )
+                )
+            )
+        print(f"Validating XML {repr(sitemap_xml.name)}...", end=" ")
+        sys.stdout.flush()
+        schema.validate(sitemap_xml)
+        print("done")
+
+    def cleanup(self):
+        rmtree(self.build_path)
 
 
 def main():
     ap = ArgumentParser()
+    ap.add_argument("--base-url", default="https://www.mar77i.info/")
+    ap.add_argument("--build-dir", default="/dev/shm/build")
     ap.add_argument("--output-dir", default="/dev/shm/output")
     args = ap.parse_args()
-    source_path = Path(__file__).parent
-    content_path = source_path / "content"
-    output_path = Path(args.output_dir)
-    if output_path.exists():
-        shutil.rmtree(output_path)
-    output_path.mkdir(0o755)
-    template = get_content(source_path / "template.html")
-    context = {
-        "nav": MDRenderer(get_content(content_path / "nav.md")).render_html()[1],
-    }
-    context["title"], context["page"] = MDRenderer(
-        get_content(content_path / "index.md")
-    ).render_html()
-    write_content(output_path / "index.html", template.format(**context))
-
-    #context["blogs"] = []
-    #context["hashtags"] = []
-    #for file in (content_path / "blog").iterdir():
-    #    if file.name == "index.md" or not file.name.endswith(".md"):
-    #        continue
-    #    context["blogs"].append(f"blog/{file.name[:-3]}.html")
-    #    context["title"], context["page"] = MDRenderer(get_content(file)).render_html()
-    #    write_content(
-    #        blog_path / f"{file.name[:-3]}.html", template.render(context),
-    #    )
-    #context["title"], context["page"] = MDRenderer(
-    #    get_content(content_path / "blog" / "index.md")
-    #).render_html()
-
-    blog_path = output_path / "blog"
-    blog_path.mkdir(0o755)
-    context["title"], context["page"] = "Blog stub", "<h2>Blog stub</h2>"
-    write_content(blog_path / "index.html", template.format(**context))
-    write_content(output_path / "style.css", get_content(source_path / "style.css"))
+    website_gen = WebsiteGenerator(
+        args.base_url, Path(args.build_dir), Path(args.output_dir)
+    )
+    website_gen.build()
+    website_gen.sync()
+    website_gen.cleanup()
 
 
 if __name__ == "__main__":
diff --git a/md.py b/md.py
index 2ae5408e0c93f08161b46ba564059e9de30a4020..ebe700e2887fafd4b366117da2518f2ea65728e3 100644 (file)
--- a/md.py
+++ b/md.py
@@ -1,5 +1,7 @@
 import os
 from io import StringIO
+from html import escape
+from urllib.parse import quote_plus
 
 _registered_tags = []
 
@@ -89,10 +91,11 @@ class MDLineTag:
 
     def __init__(self):
         self.lines = []
-        self.content = ""
         self.sio = StringIO()
+
         self.end_backslash = None
         self.backslash = False
+        self.attributes = []
         self.states = []
 
     def check_states(self, tag_class):
@@ -105,14 +108,15 @@ class MDLineTag:
             return links[-1].text is None or links[-1].start is None
         return True
 
-    def handle_backslash(self, i, c):
-        if i == self.end_backslash:
-            self.end_backslash = None
-            self.backslash = False
-            return False
-        elif self.end_backslash is not None:
-            pass
-        elif self.content[i:i + len(os.linesep)] == os.linesep:
+    def handle_backslash(self, content, i, c):
+        if self.end_backslash is not None:
+            done = i == self.end_backslash
+            if done:
+                self.end_backslash = None
+                self.backslash = False
+            # continue only if we're not at end_backslash
+            return not done
+        if content[i:i + len(os.linesep)] == os.linesep:
             self.sio.write(" ")
             self.end_backslash = i + len(os.linesep)
         else:
@@ -120,32 +124,85 @@ class MDLineTag:
             self.backslash = False
         return True
 
+#    def handle_attributes(self, c):
+#        if c not in " }=":
+#            return False
+#        self.sio.seek(self.attribute_since)
+#        value = self.sio.read()
+#        self.sio.seek(self.attribute_since)
+#        self.sio.truncate()
+#
+#        if c == "=":
+#            if self.attribute_key is not None:
+#                raise AttributeError("Key already specified!")
+#            self.attribute_key = value
+#            return True
+#        if c == "}":
+#            self.attribute_since = None
+#        return True
+
     def handle_tag(self, c):
         for tag_class in _registered_tags:
             if c == tag_class.char and self.check_states(tag_class):
                 return tag_class(self.states, self.sio)
         return None
 
+    def maybe_get_attributes(self, content):
+        if not content.startswith("{"):
+            return content
+        sio = StringIO()
+        content = content[1:]
+        backslash = False
+        key_pos = None
+        for i, c in enumerate(content):
+            if backslash:
+                sio.write(c)
+                continue
+            if c in " }":
+                value = sio.getvalue()
+                if key_pos is None:
+                    item = value, ""
+                else:
+                    item = value[:key_pos], value[key_pos:]
+                self.attributes.append(item)
+                sio.seek(0)
+                sio.truncate()
+                if c == "}":
+                    return content[i + 1:]
+                key_pos = None
+                continue
+            elif c == "\\":
+                backslash = True
+                continue
+            elif c == "=":
+                key_pos = sio.tell()
+                continue
+            sio.write(c)
+        raise ValueError("Attribute list: missing closing '}'")
+
     def render_inner(self):
         if self.sio.getvalue():
-            assert not self.lines
+            assert self.lines is None
             return self.sio.getvalue()
-        self.content = os.linesep.join(self.lines)
-        for i, c in enumerate(self.content):
-            if self.backslash and self.handle_backslash(i, c):
+        content = self.maybe_get_attributes(os.linesep.join(self.lines))
+        for i, c in enumerate(content):
+            if self.backslash and self.handle_backslash(content, i, c):
                 continue
             if c == "\\":
                 self.backslash = True
-                continue
-            if not self.handle_tag(c):
+            elif not self.handle_tag(c):
                 (self.states[-1] if self.states else self.sio).write(c)
         assert len(self.states) == 0, self.states
         assert not self.backslash
-        self.lines.clear()
+        self.lines = None
         return self.sio.getvalue()
 
     def render_outer(self):
-        return f"<{self.name}>{self.render_inner()}</{self.name}>"
+        inner_html = self.render_inner()
+        sio = StringIO()
+        for key, value in self.attributes:
+            sio.write(f' {key}="{escape(value)}"' if value else f" {key}")
+        return f"<{self.name}{sio.getvalue()}>{inner_html}</{self.name}>"
 
 
 class Paragraph(MDLineTag):
@@ -156,6 +213,10 @@ class Heading2(MDLineTag):
     name = "h2"
 
 
+class Heading3(MDLineTag):
+    name = "h3"
+
+
 class BulletList(MDLineTag):
     name = "ul"
 
@@ -182,8 +243,9 @@ class MDRenderer:
     """
     Simplified markdown to html translator.
     """
-    def __init__(self, page):
+    def __init__(self, page, url=""):
         self.page = page
+        self.url = url
         self.sio = StringIO()
         self.tag = None
         self.tags = []
@@ -195,14 +257,36 @@ class MDRenderer:
         self.tags.append(self.tag)
         return True
 
+    def render_html_pagemeta(self):
+        """
+            <link rel="canonical" href="{canonical_url}">
+            <meta name="title" content="{title}">
+            <meta name="description" content="{description}">
+            <title>mar77i.info ¬ {title}</title>
+        """
+        assert isinstance(self.tags[0], Heading2)
+        title = escape(self.tags[0].render_inner())
+        if isinstance(self.tags[1], Paragraph):
+            description = self.tags[1].render_inner()
+        else:
+            description = ""
+        return os.linesep.join(
+            (
+                f'<link rel="canonical" href="{quote_plus(self.url, "/:")}">',
+                f'<meta name="title" content="{escape(title)}">',
+                f'<meta name="description" content="{escape(description)}">',
+                f"<title>mar77i.info ¬ {title}</title>",
+            )
+        )
+
     def render_html(self):
-        title = None
         for line in self.page.split(os.linesep):
             if line.startswith("# "):
                 self.set_tag(Heading2)
-                if title is None:
-                    title = self.tag
                 line = line[2:]
+            elif line.startswith("## "):
+                self.set_tag(Heading3)
+                line = line[3:]
             elif line.startswith("- "):
                 if not self.set_tag(BulletList):
                     self.tag.lines = None
@@ -216,7 +300,4 @@ class MDRenderer:
             else:
                 self.set_tag(Paragraph)
             self.tag.lines.append(line)
-        return (
-            title.render_inner() if title else None,
-            os.linesep.join(t.render_outer() for t in self.tags),
-        )
+        return os.linesep.join(t.render_outer() for t in self.tags)
diff --git a/post-receive.sh b/post-receive.sh
index 88793e898ade4d3d2d37e316c9a9e0935621dbb3..be88228ea3dd84cedc4efa1db4798c3404215e31 100755 (executable)
--- a/post-receive.sh
+++ b/post-receive.sh
@@ -45,6 +45,10 @@ generate_dir=/dev/shm/mar77i.info
 mkdir "${generate_dir}"
 git --work-tree="${generate_dir}" --git-dir="${git_dir}" checkout master -f
 cd "${generate_dir}"
+python3 -m venv venv
+. venv/bin/activate
+pip install -U pip
+pip install -r requirements.txt
 ./generate.py --output-dir "${HOME}/webroot/www.mar77i.info"
 cd ..
 rm -rf "${generate_dir}"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644 (file)
index 0000000..71c6b59
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+elementpath==4.8.0
+ruff==0.11.0
+xmlschema==3.4.5
diff --git a/template.html b/template.html
index c6c6af81cc5733db418c92d021ea2c03c2fb4674..df39f21b43558df885a9f40e2b32dbaeb73c30ce 100644 (file)
--- a/template.html
+++ b/template.html
@@ -1,10 +1,12 @@
 <!DOCTYPE html>
 <html lang="en">
   <head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <meta charset="utf-8">
+    <meta
+      name="viewport"
+      content="width=device-width, initial-scale=1.0, maximum-scale=5">
     <meta http-equiv="X-UA-Compatible" content="ie=edge">
-    <title>mar77i.info ¬ {title}</title>
+    {pagemeta}
     <link rel="stylesheet" href="/style.css">
   </head>
   <body>
diff --git a/utils.py b/utils.py
deleted file mode 100644 (file)
index a5f2d80..0000000
--- a/utils.py
+++ /dev/null
@@ -1,14 +0,0 @@
-def get_content(path):
-    with path.open("rt") as fh:
-        return fh.read()
-
-
-def write_content(path, content):
-    print(f"writing {path}")
-    with path.open("wt") as fh:
-        fh.write(content)
-
-
-def find_or_end(s, sub, pos=0):
-    pos = s.find(sub, pos)
-    return pos if pos >= 0 else len(s)