import re
from bs4 import BeautifulSoup
class MarkdownExtractor:
    """Extract clean, readable Markdown from HTML documents.

    The conversion is deliberately shallow: boilerplate elements are removed,
    a main content container is chosen, and headings (h1-h3), paragraphs,
    list items and code blocks inside it are rendered as Markdown blocks.
    Inline formatting (links, emphasis, inline code) is reduced to plain text.
    """

    # Elements that are rendered as Markdown blocks.
    _BLOCK_TAGS = ('h1', 'h2', 'h3', 'p', 'li', 'pre', 'code')
    _HEADING_PREFIXES = {'h1': '#', 'h2': '##', 'h3': '###'}
    _LIST_TAGS = ('ul', 'ol')
    # Matches ids such as "content", "main-content" or "article-body".
    _CONTENT_ID_RE = re.compile(r'content|main|article', re.I)

    def __init__(self):
        # Tags removed, together with everything inside them, before conversion.
        self.ignore_tags = ['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe', 'noscript', 'svg', 'canvas']

    def extract(self, html: str) -> str:
        """Convert an HTML document or fragment to Markdown.

        Args:
            html: HTML markup. Falsy values (``None``, ``""``) yield ``""``.

        Returns:
            The Markdown text. ``""`` when a content container exists but
            holds nothing convertible, ``"No readable content found."`` when
            the markup has neither a recognisable container nor any
            convertible element, and ``"Extraction error: <reason>"`` when
            parsing or conversion fails (this method never raises).
        """
        if not html:
            return ""
        try:
            soup = BeautifulSoup(html, 'html.parser')

            # 1. Strip boilerplate tags.
            for tag in soup(self.ignore_tags):
                tag.decompose()

            # 2. Pick the main content container: <main>, then <article>,
            #    then the first <div> whose id looks content-like, then <body>.
            container = (
                soup.find('main')
                or soup.find('article')
                or soup.find('div', id=self._CONTENT_ID_RE)
                or soup.find('body')
            )
            # html.parser does not synthesise <body>, so bare fragments such
            # as "<p>Hi</p>" have no container; convert the whole tree then.
            root = soup if container is None else container

            # Turn line breaks into real newlines so the words on either side
            # are not glued together (and <pre> keeps its line structure).
            for line_break in root.find_all('br'):
                line_break.replace_with("\n")

            # 3. Render each block element as (is_list_item, markdown).
            blocks = []
            for element in root.find_all(list(self._BLOCK_TAGS)):
                rendered = self._render(element, root)
                if rendered:
                    blocks.append((element.name == 'li', rendered))

            if not blocks:
                return "" if container is not None else "No readable content found."

            # 4. Join: consecutive list items sit on adjacent lines, every
            #    other pair of blocks is separated by exactly one blank line.
            #    No global newline clean-up is needed afterwards, so blank
            #    lines inside code blocks survive untouched.
            pieces = []
            previous_was_item = False
            for is_item, text in blocks:
                if pieces:
                    pieces.append("\n" if is_item and previous_was_item else "\n\n")
                pieces.append(text)
                previous_was_item = is_item
            return "".join(pieces)
        except Exception as e:
            # Best-effort API: report the failure as text instead of raising.
            return f"Extraction error: {e}"

    def _render(self, element, root):
        """Return the Markdown for one block element, or None to skip it."""
        name = element.name
        ancestor_names = [parent.name for parent in self._ancestors_within(element, root)]

        if name == 'li':
            # Nested items are indented under their parent item; each item
            # carries only its own text, not that of its sub-lists.
            text = self._collapse(self._own_item_text(element))
            if not text:
                return None
            depth = ancestor_names.count('li')
            return f"{'  ' * depth}- {text}"

        # Anything nested in another rendered block (<code> in <pre>, <p> in
        # <li>, inline <code> in <p>) is already part of that block's text;
        # emitting it again would duplicate content.
        if any(ancestor in self._BLOCK_TAGS for ancestor in ancestor_names):
            return None

        if name in ('pre', 'code'):
            return self._render_code(element.get_text())

        text = self._collapse(element.get_text())
        if not text:
            return None
        prefix = self._HEADING_PREFIXES.get(name)
        return f"{prefix} {text}" if prefix else text

    @staticmethod
    def _ancestors_within(element, root):
        """Yield the ancestors of element that lie strictly below root."""
        for parent in element.parents:
            if parent is root:
                return
            yield parent

    def _own_item_text(self, item):
        """Return the text of a list item, excluding any nested lists."""
        kept = []
        for string in item.strings:
            for parent in string.parents:
                if parent is item:
                    kept.append(string)
                    break
                if parent.name in self._LIST_TAGS:
                    # Belongs to a nested list; its own <li> will render it.
                    break
        return "".join(kept)

    @staticmethod
    def _collapse(text):
        """Collapse every run of whitespace into a single space and trim."""
        return " ".join(text.split())

    @staticmethod
    def _render_code(raw):
        """Return a fenced code block for raw text, or None if it is blank."""
        lines = raw.splitlines()
        # Drop leading blank lines but keep the first code line's indentation.
        while lines and not lines[0].strip():
            del lines[0]
        body = "\n".join(lines).rstrip()
        if not body:
            return None
        # The fence must be longer than any backtick run inside the code.
        longest_run = max((len(run) for run in re.findall(r'`+', body)), default=0)
        fence = '`' * max(3, longest_run + 1)
        return f"{fence}\n{body}\n{fence}"