# website/generator/process.py

from collections.abc import Generator
import datetime
import enum
import itertools
import json
import os
import re
import shutil
import subprocess
from pathlib import Path
from typing import Any

import jinja2
from rich import print as rprint

import feeds
import markdown
from typedef import *
from util import *


class _FileType(enum.IntEnum):
    HTML_TEMPLATE = enum.auto()
    STATIC = enum.auto()
    MARKDOWN_TEMPLATE = enum.auto()


def _walk_content(
    start_dir: str | Path,
) -> Generator[tuple[Path, _FileType], None, None]:
    """Recursively yield (path, file type) pairs for everything under
    start_dir, skipping files and directories whose names start with "_"."""
    if not isinstance(start_dir, Path):
        start_dir = Path(start_dir)
    for item in os.listdir(start_dir):
        if item.startswith("_"):
            continue
        p = start_dir / item
        if os.path.isdir(p):
            yield from _walk_content(p)
            continue
        # classify by the full (possibly compound) extension, e.g. "md", "tar.gz"
        match "" if len((sp := item.split("."))) <= 1 else ".".join(sp[1:]).lower():
            case "html":
                ftype = _FileType.HTML_TEMPLATE
            case "md":
                ftype = _FileType.MARKDOWN_TEMPLATE
            case _:
                ftype = _FileType.STATIC
        yield p, ftype
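

# A minimal usage sketch (hypothetical tree):
#   for path, ftype in _walk_content("content"):
#       ...
# would yield e.g. (Path("content/about.md"), _FileType.MARKDOWN_TEMPLATE),
# while an underscore-prefixed entry such as content/_drafts/ is skipped
# entirely (neither listed nor recursed into).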


def _make_canonical_url(site_config: Any, path: Path | str) -> str:
    path_str = str(path).strip("/")
    if path_str == ".":
        path_str = ""
    return (
        site_config["baseURL"].rstrip("/")
        + "/"
        + path_str
        + ("/" if path_str != "" else "")
    )
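

# Worked example (hypothetical config): with site_config["baseURL"] set to
# "https://example.com/", _make_canonical_url(site_config, "blog/post") returns
# "https://example.com/blog/post/"; a path of "." or "/" collapses to the bare
# "https://example.com/".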


def content(
    base_dir: Path, output_dir: Path, jinja_env: jinja2.Environment, site_config: Any
):
    markdown_to_html = markdown.create(escape=False)
    walk_dir = base_dir / "content"
    for fpath, filetype in _walk_content(walk_dir):
        # the path of the file *inside* a site directory structure
        # (e.g. inside of `_dist` or inside of `content`)
        site_inner_path = fpath.relative_to(walk_dir)
        tpl_frontmatter, raw_tpl = None, None
        with open(fpath) as f:
            try:
                tpl_frontmatter, raw_tpl = extract_frontmatter(f.read())
            except ValueError:
                # could not parse the template, so this will have to be copied
                # as a static file
                pass
        if (
            tpl_frontmatter and raw_tpl
        ):  # do we have a template we can make context for and render?
            render_as_directory = (
                bool(t)
                if (t := tpl_frontmatter.get("asDirectory")) is not None
                else not (
                    site_inner_path.stem.lower() == "index"
                    and site_inner_path.suffix.lower() in [".md", ".html"]
                )
            )
            target_path = output_dir / (
                site_inner_path.with_suffix(".html")
                if not render_as_directory
                else site_inner_path.parent
                / site_inner_path.name.split(".")[0]
                / "index.html"
            )
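            # Hypothetical examples: "about.md" becomes about/index.html
            # (directory-style), while "index.md" stays put and becomes
            # index.html next to its siblings.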
            os.makedirs(target_path.parent, exist_ok=True)
            ctx = {"site": site_config, "data": DataLoader(base_dir / "data")}
            if "canonicalURL" not in tpl_frontmatter and not tpl_frontmatter.get(
                "preserveCanonicalURL", False
            ):
                internal_site_path = target_path.relative_to(output_dir)
                tpl_frontmatter["canonicalURL"] = _make_canonical_url(
                    site_config,
                    (
                        internal_site_path.parent
                        if internal_site_path.name.lower() == "index.html"
                        else internal_site_path
                    ),
                )
            ctx["page"] = tpl_frontmatter
            ctx["file"] = {
                "mod_time": datetime.datetime.fromtimestamp(os.stat(fpath).st_mtime)
            }
            match filetype:
                case _FileType.HTML_TEMPLATE:
                    if not tpl_frontmatter.get("isTemplate", True):
                        # wrap the raw content so we still get a jinja template
                        # object for the shared render step below
                        ctx["x"] = raw_tpl
                        tpl = jinja_env.from_string("{{ x | safe }}")
                    else:
                        tpl = jinja_env.from_string(raw_tpl)
                case _FileType.MARKDOWN_TEMPLATE:
                    if tpl_frontmatter.get("isTemplate", False):
                        raw_tpl = jinja_env.from_string(raw_tpl).render(ctx)
                    ctx["rendered"], render_state = markdown_to_html.parse(raw_tpl)
                    tpl_str = '{% extends "_layouts/base.html" %}{% block main %}{{ rendered | safe }}{% endblock %}'
                    if tpl_frontmatter.get("showToc", False):
                        ctx["toc"] = markdown.render_toc_from_state(
                            render_state, min_level=2
                        )
                        tpl_str += '{% import "_imports/toc.html" as tc %}{% block aside %}{{ tc.render(toc) }}{% endblock %}'
                    tpl = jinja_env.from_string(tpl_str)
                case _:
                    assert False, "impossible state"
            res = tpl.render(ctx)
            with open(target_path, "w") as f:
                f.write(res)
        else:
            if (
                fpath.suffix.lower() != ".scss"
            ):  # the sass step runs after this and handles these files for us
                if filetype != _FileType.STATIC:
                    rprint(
                        WARN_LEADER
                        + f"Treating [bold]{fpath}[/bold] (type {filetype.name}) as a static file"
                    )
                target_path = output_dir / site_inner_path
                os.makedirs(target_path.parent, exist_ok=True)
                shutil.copy(fpath, target_path)


def sass(base_dir: Path, output_dir: Path):
    # Dart Sass many-to-many mode: a "src:dest" pair compiles every stylesheet
    # in the source directory into the destination directory.
    subprocess.run(
        [
            "sass",
            "--style=compressed",
            str(base_dir / "content/assets/css")
            + ":"
            + str(output_dir / "html/assets/css"),
        ]
    ).check_returncode()


BLOG_DATE_FORMAT = "%Y-%m-%d"
BLOG_TAG_RE = re.compile(r"[a-zA-Z\d-]+")
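

# A sketch of the frontmatter blog() expects per post (keys taken from the
# lookups below; values are hypothetical):
#
#   title: My first post               # required
#   publishedDate: 2024-01-15          # required
#   updatedDate: [2024-02-01]          # optional, sorted newest-first
#   tags: [python, static-sites]       # optional, each must match BLOG_TAG_RE,
#                                      #   so "static-sites" is fine, "c++" is not
#   visible: {list: true, feed: true}  # optional, or `false` to hide the post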
def blog(
    base_dir: Path, output_dir: Path, jinja_env: jinja2.Environment, site_config: Any
):
    markdown_to_html = markdown.create(escape=False, header_level_delta=1)
    walk_dir = base_dir / "blog"
    posts = {}
    for fpath, filetype in _walk_content(walk_dir):
        inner_path = fpath.relative_to(walk_dir)
        match filetype:
            case _FileType.MARKDOWN_TEMPLATE:
                with open(fpath) as f:
                    post_frontmatter, raw_post_md = extract_frontmatter(f.read())
                post_slug = (
                    fpath.name[:-3] if fpath.name != "content.md" else fpath.parent.name
                )
                if post_slug in posts:
                    rprint(
                        ERROR_LEADER + f"Duplicate post slug [bold]{post_slug}[/bold]"
                    )
                    raise SystemExit(1)
                # check required keys
                missing_keys = [
                    key
                    for key in ["title", "publishedDate"]
                    if key not in post_frontmatter
                ]
                if len(missing_keys) > 0:
                    rprint(
                        ERROR_LEADER
                        + f"Post [bold]{post_slug}[/bold] is missing the following frontmatter keys: "
                        + ", ".join(missing_keys)
                    )
                    raise SystemExit(1)
                # check tags are valid
                if "tags" in post_frontmatter:
                    invalid_tags = [
                        tag
                        for tag in post_frontmatter["tags"]
                        if not BLOG_TAG_RE.fullmatch(tag)
                    ]
                    if len(invalid_tags) > 0:
                        rprint(
                            ERROR_LEADER
                            + f"Post [bold]{post_slug}[/bold] has the following invalid tags: "
                            + ", ".join(map(repr, invalid_tags))
                        )
                        raise SystemExit(1)
                target_path = output_dir / "blog" / post_slug / "index.html"
                os.makedirs(target_path.parent, exist_ok=True)
                if "updatedDate" in post_frontmatter:
                    post_frontmatter["updatedDate"] = sorted(
                        post_frontmatter["updatedDate"], reverse=True
                    )
                posts[post_slug] = post_frontmatter
                rendered_html, render_state = markdown_to_html.parse(raw_post_md)
                # build jinja context
                ctx = {
                    "site": site_config,
                    "post": post_frontmatter,
                    "content": rendered_html,
                    "toc": markdown.render_toc_from_state(render_state),
                    "page": {
                        k: post_frontmatter[k]
                        for k in ["title", "description", "imageURL"]
                        if k in post_frontmatter
                    },
                }
                ctx["page"]["canonicalURL"] = _make_canonical_url(
                    site_config, f"/blog/{post_slug}"
                )
                # execute jinja template
                tpl = jinja_env.get_template("_layouts/blog/post.html")
                res = tpl.render(ctx)
                # dump to file
                with open(target_path, "w") as f:
                    f.write(res)
            case _:
                if filetype != _FileType.STATIC:
                    rprint(
                        WARN_LEADER
                        + f"Treating [bold]{fpath}[/bold] (type {filetype.name}) as a static file"
                    )
                target_path = output_dir / "blog" / inner_path
                os.makedirs(target_path.parent, exist_ok=True)
                shutil.copy(fpath, target_path)
    # generate listing
    post_list = []
    tags = {}
    for slug, post in posts.items():
        # posts hidden from the listing (visible: false, or visible.list: false)
        # are skipped entirely
        if (pv := post.get("visible", {})) is False or not pv.get("list", True):
            continue
        post_list.append(
            AbbreviatedPost(
                slug,
                post["title"],
                post.get("description", ""),
                post["publishedDate"],
                (
                    None
                    if "updatedDate" not in post or len(post["updatedDate"]) == 0
                    else post["updatedDate"][0]
                ),
                "favourite" in (post_tags := post.get("tags", [])),
                post_tags,
            )
        )
        for tag in post_tags:
            tags[tag] = tags.get(tag, 0) + 1
    try:
        with open(base_dir / "blogTags.yml") as f:
            tag_descriptions = load_yaml(f.read())
    except FileNotFoundError:
        tag_descriptions = {}
    tag_list = [AbbreviatedTag(x, tag_descriptions.get(x), tags[x]) for x in tags]
    # Here, we sort twice - once to get the tag list sorted alphabetically in
    # ascending order, and a second time to get it sorted by presence of a
    # description and by article count in descending order. Since sorted() is
    # stable, alphabetical order survives as the tie-breaker.
    tag_list = sorted(tag_list, key=lambda x: x.slug)
    tag_list = sorted(
        tag_list,
        key=lambda x: (x.description is not None, x.articleCount),
        reverse=True,
    )
    post_list = sorted(
        post_list,
        key=lambda x: x.publishedDate,
        reverse=True,
    )
    with open(output_dir / "blog" / "index.html", "w") as f:
        tpl = jinja_env.get_template("_layouts/blog/index.html")
        r = tpl.render(
            {
                "site": site_config,
                "page": {
                    "title": "Blog",
                    "canonicalURL": _make_canonical_url(site_config, "/blog/"),
                    "showAside": True,
                },
                "posts": post_list,
            }
        )
        f.write(r)
    # generate tag list
    tags_output_dir = output_dir / "blog" / "tags"
    os.makedirs(tags_output_dir, exist_ok=True)
    with open(tags_output_dir / "index.html", "w") as f:
        tpl = jinja_env.get_template("_layouts/blog/tags.html")
        r = tpl.render(
            {
                "site": site_config,
                "page": {
                    "title": "Blog tags",
                    "canonicalURL": _make_canonical_url(site_config, "/blog/tags/"),
                    "showAside": True,
                },
                "tags": tag_list,
            }
        )
        f.write(r)
    # generate tag-specific index pages
    tpl = jinja_env.get_template("_layouts/blog/postsFilteredByTag.html")
    for tag in tag_list:
        d = tags_output_dir / tag.slug
        os.makedirs(d, exist_ok=True)
        with open(d / "index.html", "w") as f:
            f.write(
                tpl.render(
                    {
                        "site": site_config,
                        "page": {
                            "title": f"{tag.slug} :: Blog Tags",
                            "canonicalURL": _make_canonical_url(
                                site_config, f"/blog/tags/{tag.slug}/"
                            ),
                            "showAside": True,
                        },
                        "tag": tag,
                        "posts": [p for p in post_list if tag.slug in p.tags],
                    }
                )
            )
    # generate feeds, limited to the ten newest posts that are feed-visible
    feed_post_list = [
        p
        for p in post_list
        if (pv := posts[p.slug].get("visible", {})) is not False
        and pv.get("feed", True)
    ][:10]
    with open(output_dir / "blog" / "feed.atom", "w") as f:
        f.write(feeds.atom(site_config, feed_post_list))
    with open(output_dir / "blog" / "feed.json", "w") as f:
        f.write(feeds.json(site_config, feed_post_list))


def caddy_config(
    base_dir: Path, output_dir: Path, jinja_env: jinja2.Environment, site_config: Any
):
    try:
        with open(base_dir / "redirects.yml") as f:
            raw_redirect_rules = load_yaml(f.read())
    except FileNotFoundError:
        raw_redirect_rules = []
    redirects = []
    for i, rule in enumerate(raw_redirect_rules):
        if "from" not in rule or "to" not in rule:
            rprint(ERROR_LEADER + f"Redirect rule {i} is missing a 'from' or 'to' field")
            raise SystemExit(1)
        status = rule.get("code", 302)
        if not ((300 <= status <= 399) or status == 401):
            rprint(
                ERROR_LEADER
                + f"Redirect rule {i} has an invalid status code (not 3xx or 401)"
            )
            raise SystemExit(1)
        redirects.append(
            {
                "match": [{"path": [rule["from"]]}],
                "handle": [
                    {
                        "handler": "static_response",
                        "headers": {
                            "Location": [rule["to"]],
                        },
                        "status_code": status,
                    }
                ],
            }
        )
        # also match the trailing-slash variant of the source path
        if not rule["from"].endswith("/") and rule.get("transform", True):
            redirects.append(
                {
                    "match": [{"path": [rule["from"] + "/"]}],
                    "handle": [
                        {
                            "handler": "static_response",
                            "headers": {
                                "Location": [rule["to"]],
                            },
                            "status_code": status,
                        }
                    ],
                }
            )
    conf = {
        "admin": {"disabled": True},
        "apps": {
            "http": {
                "servers": {
                    "srv0": {
                        "automatic_https": {"disable": True},
                        "listen": [":8080"],
                        "routes": [
                            {
                                # strip the Server header from every response
                                "handle": [
                                    {
                                        "handler": "headers",
                                        "response": {
                                            "deferred": True,
                                            "delete": ["Server"],
                                        },
                                    }
                                ]
                            },
                            {
                                "handle": [
                                    {
                                        "handler": "subroute",
                                        "routes": [
                                            *redirects,
                                            {
                                                "handle": [
                                                    {
                                                        "handler": "file_server",
                                                        "root": "./html",
                                                    }
                                                ],
                                            },
                                        ],
                                        # serve a custom 404 page on errors
                                        "errors": {
                                            "routes": [
                                                {
                                                    "handle": [
                                                        {
                                                            "handler": "rewrite",
                                                            "uri": "/404.html",
                                                        },
                                                        {
                                                            "handler": "file_server",
                                                            "root": "./html",
                                                        },
                                                    ],
                                                    "terminal": True,
                                                },
                                            ],
                                        },
                                    },
                                ],
                            },
                        ],
                    },
                },
            },
        },
    }
    with open(output_dir / "caddy_config.json", "w") as f:
        json.dump(conf, f, indent="\t")
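

# A hypothetical redirects.yml consumed by caddy_config (keys match the
# lookups above; values are illustrative):
#
#   - from: /old-page
#     to: /new-page/
#     code: 301        # optional, defaults to 302; must be 3xx or 401
#   - from: /legacy
#     to: https://example.com/elsewhere
#     transform: false # optional: skip the extra "/legacy/" match variant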


def compress_png(output_dir: Path):
    pngcrush_exe: str | None = shutil.which("pngcrush")
    if pngcrush_exe is None:
        rprint(
            WARN_LEADER + "cannot find pngcrush executable, skipping PNG compression"
        )
        return
    for image in itertools.chain(
        output_dir.rglob("*.png"),
        output_dir.rglob("*.PNG"),
    ):
        # -ow makes pngcrush overwrite the input file in place
        proc: subprocess.CompletedProcess = subprocess.run([pngcrush_exe, "-ow", image])
        proc.check_returncode()


def strip_exif(output_dir: Path):
    mogrify_exe: str | None = shutil.which("mogrify")
    if mogrify_exe is None:
        rprint(
            WARN_LEADER
            + "cannot find mogrify (of ImageMagick) executable, skipping EXIF data removal"
        )
        return
    for image in itertools.chain(
        output_dir.rglob("*.jpg"),
        output_dir.rglob("*.JPG"),
        output_dir.rglob("*.jpeg"),
        output_dir.rglob("*.JPEG"),
        output_dir.rglob("*.png"),
        output_dir.rglob("*.PNG"),
    ):
        # mogrify -strip removes EXIF and other metadata from the image in place
        proc: subprocess.CompletedProcess = subprocess.run(
            [mogrify_exe, "-strip", image]
        )
        proc.check_returncode()