#!/usr/bin/env python3
"""Convert Word documents in ``raw_docs/`` to Markdown files in ``converted/``.

Paragraphs are emitted with bold/italic markers, tables are rendered as
Markdown tables, and the resulting text is normalized into a heading
hierarchy for Vietnamese legal documents:

    CHƯƠNG -> "#", Điều -> "##", Khoản -> "###", Điểm -> "####"
"""
import pathlib
import re
import sys

try:
    import docx  # from python-docx
    from docx.table import Table
    from docx.text.paragraph import Paragraph
except ImportError:
    # The text helpers below do not need python-docx; only
    # convert_doc_to_md() does, and it raises a clear error if it is missing.
    docx = None
    Table = None
    Paragraph = None

BASE = pathlib.Path(__file__).resolve().parent.parent
RAW = BASE / "raw_docs"
OUT = BASE / "converted"

# Upper-case letters (ASCII + Vietnamese) used inside regex character classes.
_VI_UPPER = (
    "A-ZÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴ"
)

# Structural headings produced by format_paragraph() for bold paragraphs.
_CHAPTER_RE = re.compile(r"^__CHƯƠNG\s+[IVXLC]+")
_ARTICLE_RE = re.compile(r"^__Điều\s+\d+")

# Item markers: "1.", "1.1.", "1.1.1.", "a)" and (first variant only) "*".
_MARKER_OR_STAR_RE = re.compile(
    r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\)|\*)\s+"
)
_MARKER_RE = re.compile(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\))\s+")

_UPPER_START_RE = re.compile(rf"^[{_VI_UPPER}]")
_KHOAN_PARTS_RE = re.compile(r"^(\d+)\.\s+(.*)")
_PLAIN_KHOAN_RE = re.compile(rf"^\d+\.\s+[{_VI_UPPER}]")

# Whole-line patterns; group(1) is the text between the emphasis markers.
_BOLD_KHOAN_RE = re.compile(r"^__(\d+\.\s+.*)__$")
_BOLD_ITALIC_LINE_RE = re.compile(r"^__\*(.*)\*__$")
_ITALIC_LINE_RE = re.compile(r"^\*(.*)\*$")
_BOLD_SUBITEM_RE = re.compile(r"^__(\d+\.\d+\.\s+.*)__$")
_ITALIC_SUBITEM_RE = re.compile(r"^\*(\d+\.\d+\.\s+.*)\*$")
_BOLD_ITALIC_STAR_RE = re.compile(r"^__\*\*(.*)\*__$")
_BOLD_ITALIC_LETTER_RE = re.compile(r"^__\*([a-z]\)\s+.*)\*__$")

# "Emphasised header + plain trailing text" patterns;
# group(1) is the header text, group(2) the remainder of the line.
_BOLD_ITALIC_PREFIX_RE = re.compile(r"^__\*(.*?)\*__(.*)$")
_BOLD_PREFIX_RE = re.compile(r"^__(.*?)__\s*(.*)$")
_ITALIC_PREFIX_RE = re.compile(r"^\*(.*?)\*\s*(.*)$")


def _cell_to_markdown(raw_text):
    """Return one table cell's text as a single-line, pipe-safe string."""
    # Collapsing whitespace also removes newlines, which would end the row.
    text = " ".join(raw_text.split())
    if not text:
        # Placeholder must be applied *after* normalization, otherwise the
        # whitespace collapse would erase it again.
        return " "
    # A literal "|" would otherwise be read as a column separator.
    return text.replace("|", "\\|")


def table_to_markdown(table) -> str:
    """Convert a python-docx table into a Markdown table.

    The first row becomes the header row. Rows shorter than the widest row
    are padded with blank cells. Cell text is collapsed onto one line and
    literal ``|`` characters are escaped as ``\\|``.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text`` (a python-docx ``Table`` satisfies this).

    Returns:
        The Markdown table, or ``""`` when the table has no rows.
    """
    rows = list(table.rows)
    if not rows:
        return ""

    data = [[_cell_to_markdown(cell.text) for cell in row.cells] for row in rows]

    # Ensure all rows have the same number of columns.
    max_cols = max(len(row) for row in data)
    for row in data:
        row.extend([" "] * (max_cols - len(row)))

    header = "| " + " | ".join(data[0]) + " |"
    sep = "| " + " | ".join(["---"] * max_cols) + " |"
    body = ["| " + " | ".join(row) + " |" for row in data[1:]]
    return "\n".join([header, sep] + body)


def get_paragraph_formatting(paragraph):
    """Extract the text and bold/italic flags of a paragraph.

    A paragraph counts as bold (italic) when at least one of its runs with
    non-whitespace text has a truthy ``bold`` (``italic``) attribute.

    Args:
        paragraph: Object exposing ``text`` and ``runs``; each run exposes
            ``text``, ``bold`` and ``italic``.

    Returns:
        ``None`` for a paragraph whose stripped text is empty, otherwise a
        dict with keys ``'text'``, ``'bold'`` and ``'italic'``.
    """
    text = paragraph.text.strip()
    if not text:
        return None

    visible_runs = [run for run in paragraph.runs if run.text.strip()]
    return {
        'text': text,
        'bold': any(run.bold for run in visible_runs),
        'italic': any(run.italic for run in visible_runs),
    }


def format_paragraph(para_info):
    """Wrap paragraph text in Markdown emphasis markers.

    Args:
        para_info: Dict as returned by ``get_paragraph_formatting`` or a
            falsy value.

    Returns:
        ``__*text*__`` for bold+italic, ``__text__`` for bold, ``*text*``
        for italic, the bare text otherwise, and ``""`` for falsy input.
    """
    if not para_info:
        return ""

    text = para_info['text']
    bold = para_info['bold']
    italic = para_info['italic']
    if bold and italic:
        return f"__*{text}*__"
    if bold:
        return f"__{text}__"
    if italic:
        return f"*{text}*"
    return text


def _diem_lines(text):
    """Return the output lines for a Điểm item.

    The heading ends at the first colon when there is one; any text after
    the colon goes on its own line.
    """
    head, colon, tail = text.partition(':')
    if not colon:
        return ["#### Điểm " + text]
    result = ["#### Điểm " + head.strip() + ':']
    tail = tail.strip()
    if tail:
        result.append(tail)
    return result


def _khoan_heading(text):
    """Return '### Khoản N. rest' for text shaped like 'N. rest', else None."""
    match = _KHOAN_PARTS_RE.match(text)
    if match is None:
        return None
    return f"### Khoản {match.group(1)}. {match.group(2)}"


def _is_standalone(lines, index):
    """Tell whether line ``index`` may start a new Khoản/Điểm item.

    True when the previous input line is missing or blank, is a Điều or
    Markdown heading, starts with a bullet character (-, +, *) or starts
    with an upper-case letter.
    """
    prev = lines[index - 1].strip() if index > 0 else ""
    if not prev:
        return True
    if _ARTICLE_RE.match(prev) or prev.startswith('##'):
        return True
    return prev.startswith(('-', '+', '*')) or bool(_UPPER_START_RE.match(prev))


def clean_and_normalize(text: str) -> str:
    """Normalize Vietnamese legal document structure into Markdown headings.

    Works line by line on text whose emphasis is encoded the way
    ``format_paragraph`` writes it (``__bold__``, ``*italic*``,
    ``__*bold italic*__``). Rules are tried in order and the first one that
    applies wins:

    * ``__CHƯƠNG <roman>`` -> ``# ...``; ``__Điều <n>`` -> ``## ...``
    * bold numbered items (``__1. ...__``) -> ``### Khoản 1. ...``
    * short plain numbered items directly below a Điều -> ``### Khoản``
    * bold/italic items starting with ``1.``, ``1.1.``, ``a)`` or ``*``
      -> ``#### Điểm ...`` (text after a colon moves to its own line)
    * an emphasised header followed by plain text on the same line is split
      into a heading line and a content line

    Lines matching no rule are kept (stripped); blank lines are kept as is.

    Args:
        text: Newline-separated input text.

    Returns:
        The normalized text with leading/trailing whitespace removed.
    """
    lines = text.split('\n')
    out = []

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        if not line:
            out.append(raw_line)
            continue

        # CHƯƠNG -> #
        if _CHAPTER_RE.match(line):
            out.append("# " + line.replace("__", ""))
            continue

        # Điều -> ##
        if _ARTICLE_RE.match(line):
            out.append("## " + line.replace("__", ""))
            continue

        # Bold+italic header, optionally followed by plain text. Checked
        # before every other rule and regardless of the previous line.
        match = _BOLD_ITALIC_PREFIX_RE.match(line)
        if match:
            header_text = match.group(1).strip()
            trailing = match.group(2).strip()
            if _MARKER_OR_STAR_RE.match(header_text):
                out.extend(_diem_lines(header_text))
                if trailing:
                    out.append(trailing)
                continue

        standalone = _is_standalone(lines, i)

        # Khoản: fully bold line starting with "N. ".
        match = _BOLD_KHOAN_RE.match(line)
        if match and standalone:
            inner = match.group(1)
            out.append(_khoan_heading(inner) or "### Khoản " + inner)
            continue

        # Khoản: short plain "N. Title" whose previous non-blank line is a Điều.
        if (
            _PLAIN_KHOAN_RE.match(line)
            and standalone
            and len(line.split()) <= 8
        ):
            j = i - 1
            while j >= 0 and not lines[j].strip():
                j -= 1
            if j >= 0:
                above = lines[j].strip()
                if _ARTICLE_RE.match(above) or above.startswith('## Điều'):
                    heading = _khoan_heading(line)
                    if heading:
                        out.append(heading)
                        continue

        # Điểm: fully bold+italic line starting with an item marker.
        match = _BOLD_ITALIC_LINE_RE.match(line)
        if match and standalone and _MARKER_OR_STAR_RE.match(match.group(1)):
            out.extend(_diem_lines(match.group(1)))
            continue

        # Điểm: fully italic line starting with an item marker.
        match = _ITALIC_LINE_RE.match(line)
        if match and standalone and _MARKER_OR_STAR_RE.match(match.group(1)):
            out.extend(_diem_lines(match.group(1)))
            continue

        # Điểm: "1.1."-style sub-items, bold then italic.
        match = _BOLD_SUBITEM_RE.match(line) or _ITALIC_SUBITEM_RE.match(line)
        if match:
            out.append("#### Điểm " + match.group(1))
            continue

        # Điểm: bold+italic asterisk item written as __**text*__.
        match = _BOLD_ITALIC_STAR_RE.match(line)
        if match and standalone:
            out.append("#### Điểm *" + match.group(1))
            continue

        # Điểm: bold+italic lettered item ("a)", "b)", ...).
        match = _BOLD_ITALIC_LETTER_RE.match(line)
        if match:
            out.extend(_diem_lines(match.group(1)))
            continue

        # Bold header followed by plain text on the same line.
        match = _BOLD_PREFIX_RE.match(line)
        if match:
            header = match.group(1)
            trailing = match.group(2).strip()
            heading = _khoan_heading(header)
            if heading is None and _MARKER_RE.match(header):
                heading = "#### Điểm " + header
            if heading is not None:
                out.append(heading)
                if trailing:
                    out.append(trailing)
                continue

        # Italic header followed by plain text on the same line.
        match = _ITALIC_PREFIX_RE.match(line)
        if match and _MARKER_RE.match(match.group(1)):
            out.append("#### Điểm " + match.group(1))
            trailing = match.group(2).strip()
            if trailing:
                out.append(trailing)
            continue

        # Plain numbered items and everything else stay as regular text.
        out.append(line)

    return '\n'.join(out).strip()


def convert_doc_to_md(doc_path, md_path):
    """Convert a document (paragraphs + tables) to normalized Markdown.

    Body elements are processed in document order: tables go through
    ``table_to_markdown`` (surrounded by blank lines), non-empty paragraphs
    through ``format_paragraph``. The joined text is passed to
    ``clean_and_normalize`` and written as UTF-8.

    Args:
        doc_path: Path of the document to read.
        md_path: Path of the Markdown file to write; missing parent
            directories are created.

    Returns:
        ``md_path`` as a ``pathlib.Path``.

    Raises:
        ImportError: If python-docx is not installed.
    """
    if docx is None:
        raise ImportError(
            "python-docx is required to convert documents "
            "(pip install python-docx)"
        )

    # str() keeps this working with python-docx versions that expect a
    # string path or a file-like object.
    document = docx.Document(str(doc_path))
    markdown_lines = []

    for element in document.element.body:
        # Compare the namespace-free tag name exactly rather than by suffix.
        local_name = element.tag.rpartition('}')[2]
        if local_name == 'tbl':
            md_table = table_to_markdown(Table(element, document))
            if markdown_lines and markdown_lines[-1].strip():
                markdown_lines.append("")
            markdown_lines.append(md_table)
            markdown_lines.append("")
        elif local_name == 'p':
            para_info = get_paragraph_formatting(Paragraph(element, document))
            if para_info:
                markdown_lines.append(format_paragraph(para_info))

    final_text = clean_and_normalize('\n'.join(markdown_lines))

    md_path = pathlib.Path(md_path)
    md_path.parent.mkdir(parents=True, exist_ok=True)
    md_path.write_text(final_text, encoding="utf-8")
    return md_path


def main() -> int:
    """Convert every .doc/.docx file found in RAW into OUT.

    A file that cannot be converted is reported on stderr and skipped so the
    rest of the batch still runs.

    Returns:
        0 when every candidate file was converted, 1 otherwise.
    """
    OUT.mkdir(exist_ok=True)
    failures = 0
    for doc in RAW.iterdir():
        if doc.suffix.lower() not in (".doc", ".docx"):
            print("Skipping:", doc)
            continue
        out = OUT / (doc.stem + ".md")
        try:
            convert_doc_to_md(doc, out)
        except Exception as exc:  # top-level boundary: report and move on
            failures += 1
            print(f"Failed: {doc} ({type(exc).__name__}: {exc})", file=sys.stderr)
            continue
        print("Converted:", out)
    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())