Reticulum/docs/clean_md.py
2026-05-07 02:16:41 +02:00

125 lines
3.5 KiB
Python
Executable file

#!/usr/bin/env python3
import os
import sys
import re
from pathlib import Path
# Regexes tested with ``match`` against each whitespace-stripped line; a line
# that begins with any of them is dropped from the output.
LINE_START_PATTERNS = [
    # Opening of an HTML anchor tag followed by whitespace,
    # e.g. <a id="..."></a>
    r'<a\s+',
    # LaTeX page-break command. As a regex this matches two literal
    # backslashes followed by "newpage".
    r'\\\\newpage',
]
# Regexes tested with ``search`` against each whitespace-stripped line; a line
# containing any of them anywhere is dropped. Currently empty: the entries
# below are disabled examples that are kept for reference.
LINE_ANY_PATTERNS = [
    # r'<div[^>]*>',
    # r'</div>',
]
def compile_patterns():
    """Compile the module-level pattern lists.

    Returns:
        A ``(start_patterns, any_patterns)`` tuple of lists holding the
        compiled forms of ``LINE_START_PATTERNS`` and ``LINE_ANY_PATTERNS``,
        in their original order.
    """
    compiled_start = list(map(re.compile, LINE_START_PATTERNS))
    compiled_any = list(map(re.compile, LINE_ANY_PATTERNS))
    return compiled_start, compiled_any
def should_remove_line(line, start_patterns, any_patterns):
    """Decide whether *line* should be dropped from the output.

    Leading and trailing whitespace is ignored when matching.

    Args:
        line: A single line of text.
        start_patterns: Compiled regexes that must match at the beginning of
            the stripped line.
        any_patterns: Compiled regexes that may match anywhere in the
            stripped line.

    Returns:
        True if any pattern from either list matches, otherwise False.
    """
    text = line.strip()
    if any(rx.match(text) for rx in start_patterns):
        return True
    return any(rx.search(text) for rx in any_patterns)
def clean_markdown_content(content, start_patterns, any_patterns, api_ref=False):
    """Return *content* with unwanted lines removed and trailing blanks trimmed.

    Every line for which ``should_remove_line`` returns true is dropped,
    together with the run of blank lines that immediately follows it. Blank
    lines at the end of the document are removed as well, so the result has
    no trailing blank lines or final newline.

    Args:
        content: Full text of a Markdown document.
        start_patterns: Compiled regexes matched at the start of each
            stripped line.
        any_patterns: Compiled regexes searched anywhere in each stripped
            line.
        api_ref: When true, headings that start with ``"### "`` or
            ``"#### "`` additionally have every ``*`` removed,
            backslash-escaped underscores unescaped, and their text wrapped
            in backticks.

    Returns:
        The cleaned document, with lines joined by a single newline.
    """
    kept = []
    # True while inside the run of blank lines that follows a removed line.
    swallowing_blanks = False
    for line in content.split('\n'):
        if should_remove_line(line, start_patterns, any_patterns):
            swallowing_blanks = True
            continue
        if swallowing_blanks:
            if line.strip() == '':
                continue
            swallowing_blanks = False
        if api_ref:
            for marker in ("### ", "#### "):
                if line.startswith(marker):
                    # Rewrite only the leading marker. Replacing the marker
                    # text throughout the line would also insert a backtick
                    # wherever the same characters occur inside the heading.
                    title = line[len(marker):].replace("*", "").replace("\\_", "_")
                    line = f"{marker}`{title}`"
                    break
        kept.append(line)
    # Remove trailing empty lines from end of file
    while kept and kept[-1].strip() == '':
        kept.pop()
    return '\n'.join(kept)
def process_file(filepath, start_patterns, any_patterns):
    """Clean one Markdown file in place.

    The file is read as UTF-8, passed through ``clean_markdown_content`` and
    rewritten only when the cleaned text differs from what was read.

    Args:
        filepath: Path (``str`` or ``Path``) of the file to clean.
        start_patterns: Compiled regexes matched at the start of each line.
        any_patterns: Compiled regexes searched anywhere in each line.

    Returns:
        True if the file was rewritten. False if it was already clean or if
        any error occurred; errors are reported on stderr, not raised.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            original_content = f.read()
        # Compare in POSIX form so the API-reference file is also recognised
        # when the platform's path separator is a backslash.
        api_ref = Path(filepath).as_posix() == "markdown/reference.md"
        cleaned_content = clean_markdown_content(
            original_content, start_patterns, any_patterns, api_ref=api_ref
        )
        if cleaned_content != original_content:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)
            return True
        return False
    except Exception as e:
        # Deliberate best-effort: one unreadable or unwritable file must not
        # abort the whole run, so report it and carry on.
        print(f"Error processing {filepath}: {e}", file=sys.stderr)
        return False
def find_markdown_files(directory):
    """Recursively collect the Markdown files below *directory*.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of ``Path`` objects, one per file whose name ends with
        ``.md``, in the order ``os.walk`` visits them.
    """
    return [
        Path(dirpath) / name
        for dirpath, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.md')
    ]
def main():
    """Command-line entry point: clean every Markdown file under a directory.

    The target directory is taken from the first command-line argument. When
    the argument is missing or is not a directory, a message is printed to
    stderr and the process exits with status 1. Otherwise every ``.md`` file
    found below the directory is cleaned in place, each modified file is
    listed on stdout, and a summary line is printed at the end.
    """
    if len(sys.argv) < 2:
        # Show the name the script was actually invoked under, not a
        # hard-coded file name that can drift from the real one.
        invoked_as = sys.argv[0] if sys.argv else ""
        script_name = os.path.basename(invoked_as) or "clean_md.py"
        print(f"Usage: python {script_name} <directory_path>", file=sys.stderr)
        sys.exit(1)
    directory = sys.argv[1]
    if not os.path.isdir(directory):
        print(f"Error: '{directory}' is not a valid directory", file=sys.stderr)
        sys.exit(1)
    start_patterns, any_patterns = compile_patterns()
    md_files = find_markdown_files(directory)
    if not md_files:
        print(f"No markdown files found in '{directory}'")
        return
    modified_count = 0
    for filepath in md_files:
        if process_file(filepath, start_patterns, any_patterns):
            print(f"Cleaned: {filepath}")
            modified_count += 1
    print(f"\nProcessed {len(md_files)} file(s), modified {modified_count}")
# Run the cleaner only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()