# cortex-hub / browser-service / src / extraction / markdown.py
import re
from bs4 import BeautifulSoup

class MarkdownExtractor:
    """Extracts clean, readable markdown from HTML documents.

    Only headings (h1-h3), paragraphs, list items and code blocks are
    rendered; all other markup is dropped and only its text is kept.
    """

    # Elements that can be rendered as a Markdown block of their own.
    _BLOCK_TAGS = ('h1', 'h2', 'h3', 'p', 'li', 'pre', 'code')
    _HEADING_MARKERS = {'h1': '#', 'h2': '##', 'h3': '###'}

    def __init__(self):
        # Tags we generally want to remove completely
        self.ignore_tags = ['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe', 'noscript', 'svg', 'canvas']

    def extract(self, html: str) -> str:
        """Convert an HTML document or fragment to Markdown.

        Args:
            html: HTML markup. Any falsy value (``""``, ``None``) yields ``""``.

        Returns:
            The Markdown text. ``""`` when a content container exists but
            holds nothing renderable, ``"No readable content found."`` when
            there is neither a container nor renderable content, and
            ``"Extraction error: <message>"`` if parsing or conversion fails.
            This method is best-effort and never raises.
        """
        if not html:
            return ""

        try:
            soup = BeautifulSoup(html, 'html.parser')

            # 1. Strip useless tags
            for tag in soup(self.ignore_tags):
                tag.decompose()

            # 2. Locate the main content (heuristics)
            root = (
                soup.find('main')
                or soup.find('article')
                or soup.find('div', id=re.compile(r'content|main|article', re.I))
                or soup.find('body')
            )
            has_container = root is not None
            if root is None:
                # html.parser does not synthesize <body>, so bare fragments
                # such as "<p>hi</p>" must be read from the document root.
                root = soup

            # <br> holds no text node; turn it into a newline so the words on
            # either side are not glued together (and <pre> keeps its breaks).
            for br in root.find_all('br'):
                br.replace_with('\n')

            # 3. Basic Markdown conversion
            owned_text = self._collect_owned_text(root)
            blocks = []            # (is_list_item, rendered_markdown)
            emitted_items = set()  # id() of every <li> that produced output
            for element in root.find_all(list(self._BLOCK_TAGS)):
                if self._is_absorbed(element, root):
                    continue
                raw = "".join(owned_text.get(id(element), ()))
                rendered = self._render(element, raw, root, emitted_items)
                if rendered:
                    blocks.append((element.name == 'li', rendered))

            if not blocks:
                return "" if has_container else "No readable content found."

            # 4. Assemble. No blank-line collapsing here: the only runs of
            # blank lines left are inside code blocks, where they are content.
            return self._join_blocks(blocks).strip()

        except Exception as e:
            # Deliberate best-effort boundary: report the failure in-band
            # instead of propagating it to the caller.
            return f"Extraction error: {e}"

    @staticmethod
    def _ancestors_within(node, root):
        """Yield the ancestors of *node* that lie strictly below *root*."""
        for ancestor in node.parents:
            if ancestor is root:
                return
            yield ancestor

    def _is_absorbed(self, element, root) -> bool:
        """Return True if *element* is rendered as part of an enclosing block.

        Absorbed elements are: anything inside <pre> (literal content), a
        <code> inside any other block (inline code), and a <p> whose nearest
        enclosing block is an <li> (paragraph-wrapped list item text).
        """
        nearest_block = None
        for ancestor in self._ancestors_within(element, root):
            if ancestor.name == 'pre':
                return True
            if nearest_block is None and ancestor.name in self._BLOCK_TAGS:
                nearest_block = ancestor
        if nearest_block is None:
            return False
        return element.name == 'code' or (
            element.name == 'p' and nearest_block.name == 'li'
        )

    def _collect_owned_text(self, root) -> dict:
        """Map ``id(block)`` to the text nodes that block is responsible for.

        Every text node is assigned to its nearest non-absorbed block
        ancestor, so nested blocks (nested lists, unclosed <p>/<li>) are
        neither duplicated nor lost. Keys are ``id()`` values because bs4
        tags compare and hash by content, which would merge identical tags.
        """
        owned = {}
        for string in root.strings:
            for ancestor in self._ancestors_within(string, root):
                if ancestor.name in self._BLOCK_TAGS and not self._is_absorbed(ancestor, root):
                    owned.setdefault(id(ancestor), []).append(string)
                    break
        return owned

    def _render(self, element, raw: str, root, emitted_items: set) -> str:
        """Render one block from its raw text; return "" if it has no text."""
        name = element.name

        if name in ('pre', 'code'):
            # Keep inner whitespace: indentation and line breaks are content.
            code = raw.rstrip().lstrip('\r\n')
            if not code:
                return ""
            # The fence must be longer than any backtick run inside the code.
            longest_run = max((len(run) for run in re.findall(r'`+', code)), default=0)
            fence = '`' * max(3, longest_run + 1)
            return f"{fence}\n{code}\n{fence}"

        # Collapse whitespace the way HTML rendering does for flowing text.
        text = " ".join(raw.split())
        if not text:
            return ""

        if name == 'li':
            # Indent under enclosing items that actually produced a line, so
            # an item is never indented beneath a parent that was skipped.
            depth = sum(
                1
                for ancestor in self._ancestors_within(element, root)
                if id(ancestor) in emitted_items
            )
            emitted_items.add(id(element))
            indent = "  " * depth
            return f"{indent}- {text}"
        if name == 'p':
            return text
        return f"{self._HEADING_MARKERS[name]} {text}"

    @staticmethod
    def _join_blocks(blocks) -> str:
        """Join blocks: one newline between list items, a blank line otherwise."""
        pieces = []
        previous_was_item = False
        for is_item, text in blocks:
            if pieces:
                pieces.append("\n" if is_item and previous_was_item else "\n\n")
            pieces.append(text)
            previous_was_item = is_item
        return "".join(pieces)