From 9d8b315d86472ee7787755819ea0794e1249551e Mon Sep 17 00:00:00 2001 From: mar77i Date: Mon, 24 Mar 2025 22:39:01 +0100 Subject: [PATCH] add sitemap --- content/blog/index.md | 11 ++ content/nav.md | 2 +- generate.py | 232 +++++++++++++++++++++++++++++++++++------- md.py | 131 +++++++++++++++++++----- post-receive.sh | 4 + requirements.txt | 3 + template.html | 8 +- utils.py | 14 --- 8 files changed, 327 insertions(+), 78 deletions(-) create mode 100644 content/blog/index.md create mode 100644 requirements.txt delete mode 100644 utils.py diff --git a/content/blog/index.md b/content/blog/index.md new file mode 100644 index 0000000..bc17520 --- /dev/null +++ b/content/blog/index.md @@ -0,0 +1,11 @@ +# Blog + +Blog overview, by [date](#by_date), or by [hashtags](#by_hashtags) + +## {id=by_date}Index + +blog-index + +## {id=by_hashtags}Hashtags + +hashtags diff --git a/content/nav.md b/content/nav.md index 6a75086..3e48e90 100644 --- a/content/nav.md +++ b/content/nav.md @@ -1,3 +1,3 @@ - [Home](/) -- [Blog](/blog) +- [Blog](/blog/) - [Git](https://git.mar77i.info/) diff --git a/generate.py b/generate.py index 53301ab..4f48f5f 100755 --- a/generate.py +++ b/generate.py @@ -1,51 +1,213 @@ #!/usr/bin/env python3 +import os import shutil +import sys +import xmlschema from argparse import ArgumentParser +from contextlib import contextmanager +from datetime import datetime, UTC +from functools import cached_property +from hashlib import sha256 +from itertools import chain from pathlib import Path +from shutil import rmtree +from xml.etree import ElementTree from md import MDRenderer -from utils import get_content, write_content + + +def path_len_parts(p): + return len(p.parts) + + +@contextmanager +def cleanup_existing_output(output_path): + existing_output = [] + if not output_path.exists(): + output_path.mkdir(0o755, True, True) + else: + for current, *dirs_and_files in os.walk(output_path): + current_path = Path(current) + existing_output.extend( + current_path / f for f in chain.from_iterable(dirs_and_files) + ) + yield existing_output + for path in sorted(existing_output, key=path_len_parts, reverse=True): + is_dir = path.is_dir() + print(f"deleting {str(path)}{'/' if is_dir else ''}") + if is_dir: + path.rmdir() + else: + path.unlink() + + +class WebsiteGenerator: + STATIC_FILES = [ + "style.css", + ] + SITEMAP_NAMESPACE = "http://www.sitemaps.org/schemas/sitemap/0.9" + SITEMAP_SCHEMA_URL = "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" + + def __init__(self, base_url, build_path, output_path): + self.base_url = base_url + if build_path.exists(): + shutil.rmtree(build_path) + build_path.mkdir(0o755, True) + self.build_path = build_path + self.output_path = output_path + self.source_path = Path(__file__).parent + with (self.source_path / "template.html").open("rt") as fh: + self.template = fh.read() + self.files_and_urls = [] + + @cached_property + def content_path(self): + return self.source_path / "content" + + AutoContent = type("AutoContentType", (), {})() + + def add_path(self, in_path, out_path, content=None, url=None): + if out_path in (x[0] for x in self.files_and_urls): + raise ValueError("Cannot add the same file multiple times") + self.files_and_urls.append((out_path, url)) + if in_path.is_dir(): + print(f"creating {str(out_path.relative_to(self.build_path))}/") + out_path.mkdir(0o755) + elif content is not None: + if content is self.AutoContent: + with in_path.open("rb") as fh: + content = fh.read() + print(f"writing {str(out_path.relative_to(self.build_path))}") + with out_path.open("wb" if isinstance(content, bytes) else "wt") as fh: + fh.write(content) + else: + raise ValueError("No content provided.") + + def render_page(self, nav, in_path, url): + with in_path.open("rt") as fh: + renderer = MDRenderer(fh.read(), url) + return self.template.format( + nav=nav, + page=renderer.render_html(), + pagemeta=renderer.render_html_pagemeta(), + ), url + + def get_url(self, out_path): + url = f"{self.base_url}{out_path.relative_to(self.build_path)}" + if url.endswith("/index.html"): + url = url[:-len("index.html")] + return url + + def build(self): + print("==> building", self.base_url) + with (self.content_path / "nav.md").open("rt") as fh: + nav = MDRenderer(fh.read()).render_html() + index_md = self.content_path / "index.md" + index_html = self.build_path / "index.html" + self.add_path( + index_md, + index_html, + *self.render_page(nav, index_md, self.get_url(index_html)), + ) + blog_md = self.content_path / "blog" / "index.md" + blog_html = self.build_path / "blog" / "index.html" + self.add_path(blog_md.parent, blog_html.parent) + self.add_path( + blog_md, + blog_html, + *self.render_page(nav, blog_md, self.get_url(blog_html)), + ) + for static_file in self.STATIC_FILES: + self.add_path( + self.source_path / static_file, + self.build_path / static_file, + self.AutoContent, + ) + + def sync(self): + print("==> syncing", self.base_url) + with cleanup_existing_output(self.output_path) as existing_output: + sitemap = [] + for src_path, url in self.files_and_urls: + rel_path = src_path.relative_to(self.build_path) + dest_path = self.output_path / rel_path + if dest_path in existing_output: + existing_output.remove(dest_path) + if src_path.is_dir(): + if not dest_path.exists(): + print("creating", str(rel_path)) + dest_path.mkdir(0o755) + continue + update = None if dest_path.exists() else "creating" + with src_path.open("rb") as fh: + src_content = fh.read() + if update is None: + with dest_path.open("rb") as fh: + if sha256(src_content).digest() != sha256(fh.read()).digest(): + update = "updating" + if update is not None: + print(update, rel_path) + with dest_path.open("wb") as out_fh: + out_fh.write(src_content) + if url is None: + continue + sitemap.append( + { + "loc": url, + "lastmod": datetime.fromtimestamp( + dest_path.stat().st_mtime, + UTC, + ).isoformat(timespec="seconds"), + } + ) + self.generate_sitemap( + self.output_path / "sitemap.xml", sitemap, existing_output + ) + + @classmethod + def generate_sitemap(cls, sitemap_xml, urls, existing_output): + if sitemap_xml in existing_output: + existing_output.remove(sitemap_xml) + schema = xmlschema.XMLSchema(cls.SITEMAP_SCHEMA_URL) + with open(sitemap_xml, "wb") as fh: + ElementTree.register_namespace("", cls.SITEMAP_NAMESPACE) + fh.write(b"\n") + fh.write( + ElementTree.tostring( + schema.encode( + { + "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "@xmlns": cls.SITEMAP_NAMESPACE, + "@xsi:schemaLocation": ( + f"{cls.SITEMAP_NAMESPACE} {cls.SITEMAP_SCHEMA_URL}" + ), + "url": urls, + } + ) + ) + ) + print(f"Validating XML {repr(sitemap_xml.name)}...", end=" ") + sys.stdout.flush() + schema.validate(sitemap_xml) + print("done") + + def cleanup(self): + rmtree(self.build_path) def main(): ap = ArgumentParser() + ap.add_argument("--base-url", default="https://www.mar77i.info/") + ap.add_argument("--build-dir", default="/dev/shm/build") ap.add_argument("--output-dir", default="/dev/shm/output") args = ap.parse_args() - source_path = Path(__file__).parent - content_path = source_path / "content" - output_path = Path(args.output_dir) - if output_path.exists(): - shutil.rmtree(output_path) - output_path.mkdir(0o755) - template = get_content(source_path / "template.html") - context = { - "nav": MDRenderer(get_content(content_path / "nav.md")).render_html()[1], - } - context["title"], context["page"] = MDRenderer( - get_content(content_path / "index.md") - ).render_html() - write_content(output_path / "index.html", template.format(**context)) - - #context["blogs"] = [] - #context["hashtags"] = [] - #for file in (content_path / "blog").iterdir(): - # if file.name == "index.md" or not file.name.endswith(".md"): - # continue - # context["blogs"].append(f"blog/{file.name[:-3]}.html") - # context["title"], context["page"] = MDRenderer(get_content(file)).render_html() - # write_content( - # blog_path / f"{file.name[:-3]}.html", template.render(context), - # ) - #context["title"], context["page"] = MDRenderer( - # get_content(content_path / "blog" / "index.md") - #).render_html() - - blog_path = output_path / "blog" - blog_path.mkdir(0o755) - context["title"], context["page"] = "Blog stub", "

Blog stub

" - write_content(blog_path / "index.html", template.format(**context)) - write_content(output_path / "style.css", get_content(source_path / "style.css")) + website_gen = WebsiteGenerator( + args.base_url, Path(args.build_dir), Path(args.output_dir) + ) + website_gen.build() + website_gen.sync() + website_gen.cleanup() if __name__ == "__main__": diff --git a/md.py b/md.py index 2ae5408..ebe700e 100644 --- a/md.py +++ b/md.py @@ -1,5 +1,7 @@ import os from io import StringIO +from html import escape +from urllib.parse import quote_plus _registered_tags = [] @@ -89,10 +91,11 @@ class MDLineTag: def __init__(self): self.lines = [] - self.content = "" self.sio = StringIO() + self.end_backslash = None self.backslash = False + self.attributes = [] self.states = [] def check_states(self, tag_class): @@ -105,14 +108,15 @@ class MDLineTag: return links[-1].text is None or links[-1].start is None return True - def handle_backslash(self, i, c): - if i == self.end_backslash: - self.end_backslash = None - self.backslash = False - return False - elif self.end_backslash is not None: - pass - elif self.content[i:i + len(os.linesep)] == os.linesep: + def handle_backslash(self, content, i, c): + if self.end_backslash is not None: + done = i == self.end_backslash + if done: + self.end_backslash = None + self.backslash = False + # continue only if we're not at end_backslash + return not done + if content[i:i + len(os.linesep)] == os.linesep: self.sio.write(" ") self.end_backslash = i + len(os.linesep) else: @@ -120,32 +124,85 @@ class MDLineTag: self.backslash = False return True +# def handle_attributes(self, c): +# if c not in " }=": +# return False +# self.sio.seek(self.attribute_since) +# value = self.sio.read() +# self.sio.seek(self.attribute_since) +# self.sio.truncate() +# +# if c == "=": +# if self.attribute_key is not None: +# raise AttributeError("Key already specified!") +# self.attribute_key = value +# return True +# if c == "}": +# self.attribute_since = None +# return True + def handle_tag(self, c): for tag_class in _registered_tags: if c == tag_class.char and self.check_states(tag_class): return tag_class(self.states, self.sio) return None + def maybe_get_attributes(self, content): + if not content.startswith("{"): + return content + sio = StringIO() + content = content[1:] + backslash = False + key_pos = None + for i, c in enumerate(content): + if backslash: + sio.write(c) + continue + if c in " }": + value = sio.getvalue() + if key_pos is None: + item = value, "" + else: + item = value[:key_pos], value[key_pos:] + self.attributes.append(item) + sio.seek(0) + sio.truncate() + if c == "}": + return content[i + 1:] + key_pos = None + continue + elif c == "\\": + backslash = True + continue + elif c == "=": + key_pos = sio.tell() + continue + sio.write(c) + raise ValueError("Attribute list: missing closing '}'") + def render_inner(self): if self.sio.getvalue(): - assert not self.lines + assert self.lines is None return self.sio.getvalue() - self.content = os.linesep.join(self.lines) - for i, c in enumerate(self.content): - if self.backslash and self.handle_backslash(i, c): + content = self.maybe_get_attributes(os.linesep.join(self.lines)) + for i, c in enumerate(content): + if self.backslash and self.handle_backslash(content, i, c): continue if c == "\\": self.backslash = True - continue - if not self.handle_tag(c): + elif not self.handle_tag(c): (self.states[-1] if self.states else self.sio).write(c) assert len(self.states) == 0, self.states assert not self.backslash - self.lines.clear() + self.lines = None return self.sio.getvalue() def render_outer(self): - return f"<{self.name}>{self.render_inner()}" + inner_html = self.render_inner() + sio = StringIO() + for key, value in self.attributes: + sio.write(f' {key}="{escape(value)}"' if value else f" {key}") + return f"<{self.name}{sio.getvalue()}>{inner_html}" class Paragraph(MDLineTag): @@ -156,6 +213,10 @@ class Heading2(MDLineTag): name = "h2" +class Heading3(MDLineTag): + name = "h3" + + class BulletList(MDLineTag): name = "ul" @@ -182,8 +243,9 @@ class MDRenderer: """ Simplified markdown to html translator. """ - def __init__(self, page): + def __init__(self, page, url=""): self.page = page + self.url = url self.sio = StringIO() self.tag = None self.tags = [] @@ -195,14 +257,36 @@ class MDRenderer: self.tags.append(self.tag) return True + def render_html_pagemeta(self): + """ + + + + mar77i.info ¬ {title} + """ + assert isinstance(self.tags[0], Heading2) + title = escape(self.tags[0].render_inner()) + if isinstance(self.tags[1], Paragraph): + description = self.tags[1].render_inner() + else: + description = "" + return os.linesep.join( + ( + f'', + f'', + f'', + f"mar77i.info ¬ {title}", + ) + ) + def render_html(self): - title = None for line in self.page.split(os.linesep): if line.startswith("# "): self.set_tag(Heading2) - if title is None: - title = self.tag line = line[2:] + elif line.startswith("## "): + self.set_tag(Heading3) + line = line[3:] elif line.startswith("- "): if not self.set_tag(BulletList): self.tag.lines = None @@ -216,7 +300,4 @@ class MDRenderer: else: self.set_tag(Paragraph) self.tag.lines.append(line) - return ( - title.render_inner() if title else None, - os.linesep.join(t.render_outer() for t in self.tags), - ) + return os.linesep.join(t.render_outer() for t in self.tags) diff --git a/post-receive.sh b/post-receive.sh index 88793e8..be88228 100755 --- a/post-receive.sh +++ b/post-receive.sh @@ -45,6 +45,10 @@ generate_dir=/dev/shm/mar77i.info mkdir "${generate_dir}" git --work-tree="${generate_dir}" --git-dir="${git_dir}" checkout master -f cd "${generate_dir}" +python3 -m venv venv +. venv/bin/activate +pip install -U pip +pip install -r requirements.txt ./generate.py --output-dir "${HOME}/webroot/www.mar77i.info" cd .. rm -rf "${generate_dir}" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..71c6b59 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +elementpath==4.8.0 +ruff==0.11.0 +xmlschema==3.4.5 diff --git a/template.html b/template.html index c6c6af8..df39f21 100644 --- a/template.html +++ b/template.html @@ -1,10 +1,12 @@ - - + + - mar77i.info ¬ {title} + {pagemeta} diff --git a/utils.py b/utils.py deleted file mode 100644 index a5f2d80..0000000 --- a/utils.py +++ /dev/null @@ -1,14 +0,0 @@ -def get_content(path): - with path.open("rt") as fh: - return fh.read() - - -def write_content(path, content): - print(f"writing {path}") - with path.open("wt") as fh: - fh.write(content) - - -def find_or_end(s, sub, pos=0): - pos = s.find(sub, pos) - return pos if pos >= 0 else len(s) -- 2.51.0