sa6anw.se/split_lang.py

65 lines
2.0 KiB
Python

import os
import re
# Directory that is scanned for the combined (multi-language) Markdown files.
SOURCE_DIR = "docs"

# Language code -> directory that receives the per-language copy of each file.
OUTPUT_DIRS = {
    "sv": "docs_sv",
    "en": "docs_en",
}
# One fenced language block: an opening ":::sv" / ":::en" marker (matched
# case-insensitively, optional whitespace after the colons), the block body,
# and a closing ":::" on a following line.
lang_block_re = re.compile(r":::\s*(sv|en)\n(.*?)\n:::", re.DOTALL | re.IGNORECASE)
# A "---" ... "---" frontmatter section at the very start of the text.
frontmatter_re = re.compile(r"(?s)^---\n(.*?)\n---\n")


def split_content_by_language(content):
    """Split a combined Markdown document into one text per language.

    Text inside ``:::sv`` / ``:::en`` blocks belongs to that language; all
    text outside such blocks is "neutral" and is shared by both languages.

    Args:
        content: Full text of the source document, optionally starting with
            a ``---`` delimited frontmatter section.

    Returns:
        A dict with the keys ``"sv"`` and ``"en"``:

        * If the document contains no language blocks, ``"sv"`` is the whole
          document (frontmatter included) and ``"en"`` is ``""``.
        * Otherwise each value is ``frontmatter + language text + "\\n\\n" +
          neutral text``. Multiple blocks of the same language are joined
          with a blank line, and the neutral text is placed after the
          language text.
        * A language that ends up with neither language text nor neutral
          text is ``""``, so callers can skip it instead of emitting a page
          that consists of frontmatter only.
    """
    # Extract the frontmatter first (e.g. title) so it can be repeated
    # verbatim at the top of every language variant.
    frontmatter = ""
    frontmatter_match = frontmatter_re.match(content)
    if frontmatter_match:
        frontmatter = frontmatter_match.group(0)
        content = content[frontmatter_match.end():]

    matches = list(lang_block_re.finditer(content))
    if not matches:
        # No language blocks at all -> treat the document as Swedish only.
        return {"sv": frontmatter + content, "en": ""}

    lang_parts = {"sv": [], "en": []}
    neutral_parts = []
    pos = 0
    for match in matches:
        # Everything between the previous block and this one is neutral.
        neutral_parts.append(content[pos:match.start()])
        lang_parts[match.group(1).lower()].append(match.group(2).strip())
        pos = match.end()
    neutral_parts.append(content[pos:])
    neutral = "".join(neutral_parts).strip()

    # Assemble the language variants: frontmatter + language text + neutral.
    blocks = {}
    for lang, parts in lang_parts.items():
        body = "\n\n".join(parts).strip()
        if body or neutral:
            blocks[lang] = frontmatter + body + "\n\n" + neutral
        else:
            # Nothing but (possibly) frontmatter: report the language as
            # empty, consistent with the no-language-blocks case above.
            blocks[lang] = ""
    return blocks
# Split every Markdown file found directly in SOURCE_DIR and write each
# non-empty language variant to that language's output directory.
for filename in os.listdir(SOURCE_DIR):
    if not filename.endswith(".md"):
        continue

    with open(os.path.join(SOURCE_DIR, filename), "r", encoding="utf-8") as src:
        per_language = split_content_by_language(src.read())

    for lang, text in per_language.items():
        body = text.strip()
        if not body:
            # Nothing to write for this language; report it and move on.
            print(f"⚠️ Skippade {lang}/{filename} (tom)")
            continue

        target_dir = OUTPUT_DIRS[lang]
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, filename), "w", encoding="utf-8") as dst:
            # Output is normalised to exactly one trailing newline.
            dst.write(body + "\n")
        print(f"✔ Skrev {lang}/{filename}")