sa6anw.se/split_markdown.py

41 lines
1.1 KiB
Python

import os
import re
# Directory that is scanned for the Markdown sources to split.
SOURCE_DIR = "docs"

# Output directory for each language code; every processed file is written
# once into each of these directories under its original file name.
OUTPUT_DIRS = {
    "sv": "docs_sv",
    "en": "docs_en",
}
# Matches one fenced language block:
#
#     ::: sv
#     ...body...
#     :::
#
# Group 1 is the language code, group 2 the body (including its trailing
# newline, which the caller strips).
#
# * ``[ \t]*`` after the code tolerates invisible trailing whitespace on the
#   opening fence.
# * ``(?<=\n):::`` requires the closing fence to start a line but does not
#   consume the newline, so an empty block ("::: sv\n:::") closes on its own
#   fence instead of running on into the next block.
lang_block_re = re.compile(
    r":::\s*(sv|en)[ \t]*\n(.*?)(?<=\n):::",
    re.DOTALL,
)


def split_content_by_language(content):
    """Split bilingual Markdown into one document per language.

    Text inside a ``::: sv`` / ``::: en`` block is routed only to that
    language; all text outside such blocks (language-neutral text) is copied
    to every language, in its original position.

    Args:
        content: The full Markdown source as a single string.

    Returns:
        A dict with the keys ``"sv"`` and ``"en"``, each mapping to the
        assembled text for that language. The body of every language block
        is stripped of surrounding whitespace and followed by one blank
        line. Input without any language block is returned unchanged for
        both languages.
    """
    # Collect fragments per language and join once at the end rather than
    # growing strings with repeated "+=".
    parts = {"sv": [], "en": []}
    pos = 0  # end offset of the previous language block

    for match in lang_block_re.finditer(content):
        start, end = match.span()
        lang, text = match.groups()

        # Language-neutral text between the previous block and this one
        # belongs to every language.
        neutral = content[pos:start]
        for chunks in parts.values():
            chunks.append(neutral)

        parts[lang].append(text.strip() + "\n\n")
        pos = end

    # Whatever follows the last block is neutral as well.
    tail = content[pos:]
    for chunks in parts.values():
        chunks.append(tail)

    return {lang: "".join(chunks) for lang, chunks in parts.items()}
# Split every Markdown file found directly in SOURCE_DIR and write one copy
# per language into the matching output directory, keeping the file name.
for md_name in os.listdir(SOURCE_DIR):
    if not md_name.endswith(".md"):
        continue

    source_path = os.path.join(SOURCE_DIR, md_name)
    with open(source_path, "r", encoding="utf-8") as src:
        raw_markdown = src.read()

    per_language = split_content_by_language(raw_markdown)

    for lang_code, lang_text in per_language.items():
        target_dir = OUTPUT_DIRS[lang_code]
        # Created on demand so a fresh checkout needs no manual setup.
        os.makedirs(target_dir, exist_ok=True)

        target_path = os.path.join(target_dir, md_name)
        with open(target_path, "w", encoding="utf-8") as dst:
            # Leading/trailing whitespace is trimmed before writing.
            dst.write(lang_text.strip())