"""Note ingestion engine for the Second Brain RAG system."""

import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

from pydantic import BaseModel

from health.utils.logging_config import setup_logger
from slack_bot.llm.gemini import GeminiLLM
from slack_bot.obsidian.vector_store import ChunkMetadata, ChromaVectorStore, split_md_by_headers

logger = setup_logger(__name__)


class ExtractedNote(BaseModel):
    """LLM-extracted structured note from raw input text.

    Populated from the JSON object the extraction prompt asks the LLM to
    return (title, summary, key_points, entities, cleaned_body, language).
    ``source_url`` is not part of that prompt and defaults to None.
    """

    title: str             # concise, descriptive title (prompt caps it at 80 chars)
    summary: str           # 2-3 sentences
    key_points: List[str]  # up to 7 items
    entities: List[str]    # people, concepts, projects; wrapped as [[entity]] in cleaned_body
    cleaned_body: str      # full text with entities wrapped in [[double brackets]]
    source_url: Optional[str] = None
    language: str = "zh"   # "zh" if primarily Chinese, "en" otherwise (per prompt)


class NoteIngester:
    """Ingests raw text or URLs into the Obsidian vault and vector store.

    Pipeline: detect URL -> fetch content (multi-strategy with fallback) ->
    LLM structuring -> write a Markdown note with YAML frontmatter ->
    chunk and index the body in ChromaDB.
    """

    # Domains that block generic scrapers; for these, try a direct
    # js_content parse first (see _fetch_url for the strategy order).
    _WECHAT_DOMAINS = {"mp.weixin.qq.com", "weixin.qq.com"}
    # Fetched content shorter than this is treated as a failed fetch
    # (login walls and error pages typically return short stubs).
    _MIN_CONTENT_LEN = 200

    def __init__(self, vault_path: Path, vector_store: "ChromaVectorStore") -> None:
        """Initialize the note ingester.

        Args:
            vault_path: Root path of the Obsidian vault.
            vector_store: ChromaDB vector store for indexing chunks.
        """
        self.vault_path = vault_path
        self.notes_dir = vault_path / "notes"
        self.notes_dir.mkdir(parents=True, exist_ok=True)
        self.vector_store = vector_store
        self.llm = GeminiLLM()

    def ingest(self, user_input: str) -> str:
        """Ingest text or a URL into the vault and vector store.

        Args:
            user_input: Raw text or a URL to ingest.

        Returns:
            Slack-formatted confirmation message.
        """
        url = self._detect_url(user_input)
        source_url: Optional[str] = None

        if url:
            logger.info(f"Detected URL: {url}. Fetching content...")
            try:
                raw_text = self._fetch_url(url)
                source_url = url
            except Exception as e:
                logger.warning(f"URL fetch failed ({e})")
                # Unfetchable (WeChat anti-bot / paywall etc.): ask the user
                # to paste the article body manually.
                return (
                    f"⚠️ 无法自动抓取该链接内容（{e}）\n\n"
                    f"请将文章正文直接粘贴发送给我，我来帮你保存入库。"
                )
        else:
            raw_text = user_input

        extraction_failed = False
        try:
            note = self._extract_with_llm(raw_text, source_url)
        except Exception as e:
            logger.error(f"LLM extraction failed: {e}")
            # Don't discard the user's content (especially freshly fetched
            # URL text): save a minimal unstructured note instead of
            # returning an error and losing everything.
            note = self._fallback_note(raw_text, source_url)
            extraction_failed = True

        file_path = self._write_md_file(note)
        self._chunk_and_index(note, file_path)

        entity_links = " ".join(f"[[{e}]]" for e in note.entities[:5])
        rel_path = file_path.relative_to(self.vault_path)
        prefix = "⚠️ 内容智能提取失败，已按原文保存。\n" if extraction_failed else ""
        return (
            f"{prefix}"
            f"✅ *Note saved:* {note.title}\n"
            f"{note.summary}\n"
            f"_{entity_links}_\n"
            f"📁 `{rel_path}`"
        )

    def _detect_url(self, text: str) -> Optional[str]:
        """Extract the first HTTP/HTTPS URL from text, handling Slack's <URL> format.

        Slack wraps URLs as ``<url>`` or ``<url|display text>``. The angle
        brackets and pipe are excluded from the match so they don't get
        appended to the URL.

        Args:
            text: Input text to scan.

        Returns:
            First clean URL found, or None.
        """
        # Exclude Slack delimiters: angle brackets and pipe
        match = re.search(r'https?://[^\s<>|]+', text)
        if not match:
            return None
        # Drop trailing sentence punctuation the regex may have captured.
        return match.group(0).rstrip('.,;:')

    def _fetch_url(self, url: str) -> str:
        """Fetch URL content using multiple strategies with automatic fallback.

        Strategy order:
        - WeChat / known blocked domains: direct js_content parse → Tavily → Playwright
        - All other URLs: Tavily → Jina → Playwright

        Args:
            url: URL to fetch.

        Returns:
            Extracted text content (>= _MIN_CONTENT_LEN chars).

        Raises:
            RuntimeError: If all strategies fail.
        """
        host_match = re.search(r'https?://([^/]+)', url)
        host = host_match.group(1) if host_match else ""

        if host in self._WECHAT_DOMAINS:
            # WeChat: direct parse first, Playwright as last resort
            strategies = [self._fetch_direct_wechat, self._fetch_tavily, self._fetch_playwright]
        else:
            strategies = [self._fetch_tavily, self._fetch_jina, self._fetch_playwright]

        last_err: Exception = RuntimeError("No strategies available")
        for strategy in strategies:
            try:
                content = strategy(url)
                if len(content) >= self._MIN_CONTENT_LEN:
                    logger.info(f"Fetched {len(content)} chars via {strategy.__name__}")
                    return content
                # Too-short content usually means an error page; try the next one.
                logger.debug(f"{strategy.__name__} returned too little content ({len(content)} chars)")
            except Exception as e:
                logger.debug(f"{strategy.__name__} failed: {e}")
                last_err = e

        raise RuntimeError(f"所有抓取策略均失败: {last_err}")

    def _fetch_jina(self, url: str) -> str:
        """Fetch via Jina Reader (r.jina.ai) — works well for WeChat and most sites.

        Args:
            url: Target URL.

        Returns:
            Markdown text extracted by Jina.
        """
        import httpx

        jina_url = f"https://r.jina.ai/{url}"
        headers: dict = {"Accept": "text/plain", "X-Return-Format": "markdown"}
        # An API key is optional; with one, Jina grants higher rate limits.
        jina_key = os.environ.get("JINA_API_KEY")
        if jina_key:
            headers["Authorization"] = f"Bearer {jina_key}"

        resp = httpx.get(jina_url, headers=headers, timeout=30, follow_redirects=True)
        resp.raise_for_status()
        return resp.text

    def _fetch_direct_wechat(self, url: str) -> str:
        """Directly fetch a WeChat public article by parsing js_content from HTML.

        WeChat public articles (mp.weixin.qq.com) embed the full body in
        <div id="js_content">. This method extracts title + body without
        needing a headless browser or external proxy.

        Args:
            url: WeChat article URL.

        Returns:
            Plain text of title + article body.
        """
        import httpx

        # Desktop browser UA + zh locale: WeChat serves the full article HTML.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Accept-Language": "zh-CN,zh;q=0.9",
        }
        resp = httpx.get(url, headers=headers, timeout=15, follow_redirects=True)
        resp.raise_for_status()
        html = resp.text

        parts: list[str] = []

        # Extract title from the <h1 class="rich_media_title"> element.
        title_m = re.search(
            r'class=["\'][^"\']*rich_media_title[^"\']*["\'][^>]*>(.*?)</h1>',
            html, re.DOTALL
        )
        if title_m:
            title_text = re.sub(r'<[^>]+>', '', title_m.group(1)).strip()
            if title_text:
                parts.append(f"# {title_text}")

        # Extract body: find js_content start, end at first known landmark
        start_m = re.search(r'id=["\']js_content["\']', html)
        if start_m:
            content_start = html.index('>', start_m.end()) + 1
            # Landmarks that mark the end of the article body in WeChat HTML
            end_markers = [
                'id="js_bottom_bar"',
                'id="js_related_posts"',
                'id="js_msg_card"',
                'id="js_sg_bar"',
                'id="js_pc_qr_code"',
                'class="qr_code_pc_outer"',
            ]
            content_end = len(html)
            # Keep the earliest landmark found after the body start.
            for marker in end_markers:
                idx = html.find(marker, content_start)
                if 0 < idx < content_end:
                    content_end = idx
            body_html = html[content_start:content_end]
            # Strip tags, decode common entities, collapse whitespace.
            body = re.sub(r'<[^>]+>', ' ', body_html)
            body = re.sub(r'&nbsp;', ' ', body)
            body = re.sub(r'&[a-zA-Z]+;', '', body)
            body = re.sub(r'\s+', ' ', body).strip()
            if body:
                parts.append(body)

        if not parts:
            # Fallback: strip all tags from full HTML
            text = re.sub(r'<[^>]+>', ' ', html)
            return re.sub(r'\s+', ' ', text).strip()

        return "\n\n".join(parts)

    def _fetch_tavily(self, url: str) -> str:
        """Fetch via Tavily extract API.

        Args:
            url: Target URL.

        Returns:
            Raw content string from Tavily.

        Raises:
            RuntimeError: If API key missing or no content returned.
        """
        from tavily import TavilyClient

        api_key = os.environ.get("TAVILY_API_KEY")
        if not api_key:
            raise RuntimeError("TAVILY_API_KEY is not set in environment")

        client = TavilyClient(api_key=api_key)
        result = client.extract(urls=[url])
        if result and result.get("results"):
            return result["results"][0].get("raw_content", "")
        raise RuntimeError("Tavily 未能提取到内容")

    def _fetch_playwright(self, url: str) -> str:
        """Fetch via headless Chromium with stealth mode (last resort).

        Renders the full page in a real browser, bypassing JS-heavy SPAs and
        most anti-bot fingerprinting. Slower (~3-5s) but most comprehensive.

        Args:
            url: Target URL.

        Returns:
            Visible body text extracted from the rendered page.

        Raises:
            ImportError: If playwright or playwright-stealth is not installed.
            RuntimeError: If the page loads but body text is too short.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            raise ImportError(
                "playwright is not installed. Run: pip install playwright && "
                "playwright install chromium"
            )
        try:
            from playwright_stealth import Stealth
        except ImportError:
            raise ImportError(
                "playwright-stealth is not installed. Run: pip install playwright-stealth"
            )

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                Stealth().apply_stealth_sync(page)
                page.goto(url, timeout=30000, wait_until="domcontentloaded")
                # Allow JS rendering to settle
                page.wait_for_timeout(2000)
                content = page.inner_text("body").strip()
                logger.info(f"Playwright fetched {len(content)} chars from {url}")
                return content
            finally:
                browser.close()

    def _extract_with_llm(self, raw_text: str, source_url: Optional[str]) -> "ExtractedNote":
        """Use LLM to extract structured note data from raw text.

        Args:
            raw_text: Raw content to process (truncated to 8000 chars).
            source_url: Optional source URL, attached to the resulting note.

        Returns:
            Validated ExtractedNote model.

        Raises:
            ValueError: If LLM output cannot be parsed as valid JSON.
        """
        url_context = f"Source URL: {source_url}\n" if source_url else ""
        prompt = f"""You are a knowledge extraction assistant. Extract structured information from the text below.
{url_context}
=== TEXT ===
{raw_text[:8000]}

=== TASK ===
Return a JSON object with EXACTLY these fields:
- "title": A concise, descriptive title (max 80 chars)
- "summary": 2-3 sentence summary of the core content
- "key_points": Array of up to 7 key points (strings)
- "entities": Array of important people, concepts, projects, or tools mentioned (max 10)
- "cleaned_body": The full text, cleaned and reorganized. Wrap entity names in [[double brackets]] like [[entity name]].
- "language": "zh" if the content is primarily Chinese, "en" otherwise

CRITICAL:
- Output ONLY valid JSON, no markdown code blocks, no explanation
- Do NOT include tags like #writing_sample or #reply_sample
- Entities should be wrapped as [[entity]] in cleaned_body
"""
        response, _ = self.llm.generate_response(prompt, [])

        # Strip potential markdown fences
        response = response.strip()
        if response.startswith("```"):
            response = re.sub(r'^```(?:json)?\n?', '', response)
            response = re.sub(r'\n?```$', '', response)

        data = json.loads(response)
        # The prompt does not ask the LLM to echo source_url, so attach it
        # here; otherwise the note would silently lose its provenance.
        if source_url and not data.get("source_url"):
            data["source_url"] = source_url
        return ExtractedNote.model_validate(data)

    def _fallback_note(self, raw_text: str, source_url: Optional[str]) -> "ExtractedNote":
        """Create a minimal note when LLM extraction fails.

        Args:
            raw_text: Raw content.
            source_url: Optional source URL.

        Returns:
            Basic ExtractedNote with raw text as body.
        """
        title = raw_text[:60].replace('\n', ' ').strip() + "..."
        return ExtractedNote(
            title=title,
            summary=raw_text[:200],
            key_points=[],
            entities=[],
            cleaned_body=raw_text,
            source_url=source_url,
            language="zh",
        )

    def _write_md_file(self, note: "ExtractedNote") -> Path:
        """Write note to vault as a Markdown file with YAML frontmatter.

        Args:
            note: Extracted note data.

        Returns:
            Path to the written file.
        """
        date_str = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d")
        # Fall back to "note" when the title is all punctuation (empty slug).
        slug = re.sub(r'[^\w\u4e00-\u9fff-]', '-', note.title)[:50].strip('-') or "note"
        file_path = self.notes_dir / f"{date_str}_{slug}.md"
        # Never clobber an existing note that shares the same date + slug.
        counter = 1
        while file_path.exists():
            counter += 1
            file_path = self.notes_dir / f"{date_str}_{slug}-{counter}.md"

        def yaml_str(value: str) -> str:
            # JSON string literals are valid YAML double-quoted scalars, so
            # this safely escapes quotes/backslashes/newlines in LLM output.
            return json.dumps(value, ensure_ascii=False)

        source_line = f"source_url: {yaml_str(note.source_url)}\n" if note.source_url else ""
        if note.key_points:
            key_points_block = "key_points:\n" + "\n".join(
                f"  - {yaml_str(kp)}" for kp in note.key_points
            ) + "\n"
        else:
            # Explicit empty list keeps the frontmatter valid YAML.
            key_points_block = "key_points: []\n"
        ingested_at = datetime.now(tz=timezone.utc).isoformat()

        content = (
            f"---\n"
            f"title: {yaml_str(note.title)}\n"
            f"date: {date_str}\n"
            f"tags:\n"
            f"  - auto_ingested\n"
            f"language: {note.language}\n"
            f"{source_line}"
            f"{key_points_block}"
            f"ingested_at: {ingested_at}\n"
            f"---\n\n"
            f"## Summary\n\n"
            f"{note.summary}\n\n"
            f"## Content\n\n"
            f"{note.cleaned_body}\n"
        )

        file_path.write_text(content, encoding="utf-8")
        logger.info(f"Note written: {file_path}")
        return file_path

    def _chunk_and_index(self, note: "ExtractedNote", file_path: Path) -> None:
        """Split note body into chunks and index them in the vector store.

        Args:
            note: Extracted note data.
            file_path: Path to the written markdown file.
        """
        body = f"## Summary\n\n{note.summary}\n\n## Content\n\n{note.cleaned_body}"
        chunks = split_md_by_headers(body)

        if not chunks:
            # Fallback: single chunk from summary + start of body
            chunks = [(note.summary + "\n\n" + note.cleaned_body)[:1000]]

        ingested_at = datetime.now(tz=timezone.utc).isoformat()
        metadatas = [
            ChunkMetadata(
                source_path=str(file_path),
                chunk_index=i,
                # Use the leading header line as hierarchy info when present.
                header_hierarchy=(
                    chunk.split('\n')[0][:100] if chunk.startswith('#') else "body"
                ),
                note_title=note.title,
                tags=["auto_ingested"],
                ingested_at=ingested_at,
            )
            for i, chunk in enumerate(chunks)
        ]

        self.vector_store.add_chunks(chunks, metadatas)
        logger.info(f"Indexed {len(chunks)} chunks for note: {note.title}")
