"""Zhihu Hunter — vault scanning, question discovery, and answer drafting.

Workflow:
    1. scan_and_hunt()  →  scans recent vault files, extracts keywords,
                           searches Tavily for matching Zhihu questions.
    2. draft_answer()   →  uses existing RAG + ZhihuGenerator to produce
                           a Zhihu-style answer draft ready for review.
"""

import os
import re
import requests
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import List, Optional

from pydantic import BaseModel, Field

from health.utils.logging_config import setup_logger
from slack_bot.llm.gemini import GeminiLLM
from slack_bot.obsidian.generators import ZhihuGenerator
from slack_bot.obsidian.indexer import ObsidianIndexer
from slack_bot.obsidian.vector_store import ChromaVectorStore

logger = setup_logger(__name__)


# ── Vault directories to watch for new/updated content ────────────────────────
# Paths are relative to vault_path root.
WATCH_DIRS: List[str] = [
    "notes",
    "Article",
    "auto_report/deep-dive",
    "CLAW/deepfact",
    "Clippings",
]

# Article is always included regardless of time; Revised files are prioritized.
# Compared against entries of WATCH_DIRS / DIR_SHORTCUTS values, so the casing
# must match exactly.
_ARTICLE_DIR = "Article"

# Short names users can type to target a specific directory.
# Keys are the lowercase aliases; values are vault-relative paths.
DIR_SHORTCUTS: dict[str, str] = {
    "notes":      "notes",
    "article":    "Article",
    "clippings":  "Clippings",
    "claw":       "CLAW/deepfact",
    "deepfact":   "CLAW/deepfact",
    "deep-dive":  "auto_report/deep-dive",
}

# Scan/search budgets — tuned so one hunt stays within a single LLM prompt
# and a handful of Tavily calls.
_MAX_FILES_PER_DIR   = 4    # most-recent files per directory (keeps dir diversity)
_MAX_FILES_ARTICLE   = 8    # Article gets more slots (higher priority)
_MAX_KEYWORDS        = 10   # keywords extracted from vault scan
_MAX_QUESTIONS_TOTAL = 12   # total Zhihu questions surfaced per hunt
# Must contain /question/<id>; answer/comment subpages are excluded in _find_questions
_MIN_QUESTION_URL_RE = re.compile(r'zhihu\.com/question/\d+')


# ── Data Models ────────────────────────────────────────────────────────────────

class ZhihuQuestion(BaseModel):
    """A Zhihu question candidate surfaced by the hunter.

    Created either from Tavily search results (title/snippet/keywords
    populated) or from a user-pasted URL (description fetched from the
    live page; see hunt_direct_url).
    """

    title: str                          # question headline (or fallback label "知乎问题 #<id>")
    url: str                            # normalized zhihu.com/question/<id> URL, no query/fragment
    snippet: Optional[str] = None       # short search-result excerpt (≤200 chars), if any
    keywords: List[str] = Field(default_factory=list)   # keyword(s) that surfaced this question
    found_at: str = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc).isoformat()
    )                                   # UTC ISO-8601 timestamp of discovery
    description: Optional[str] = None   # question body/detail text from Zhihu page
    source_file: Optional[str] = None   # vault-relative path, set by scan_single_file()
    outline: Optional[str] = None       # user-provided answer logic/structure (direct URL mode)

class AnswerDraft(BaseModel):
    """A generated answer draft awaiting human review.

    Produced by ZhihuHunter.draft_answer(); carries the originating
    question plus provenance of the retrieval sources used.
    """

    question: ZhihuQuestion             # the question this draft answers
    content: str                        # generated Zhihu-style answer text
    generated_at: str = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc).isoformat()
    )                                   # UTC ISO-8601 timestamp of generation
    rag_sources: List[str] = Field(default_factory=list)  # deduplicated source names/titles
    vault_file: Optional[str] = None    # absolute path set after saving to vault


# ── Main Class ─────────────────────────────────────────────────────────────────

class ZhihuHunter:
    """Discovers Zhihu questions from vault content and drafts answers.

    Example:
        hunter = ZhihuHunter(vault_path, vector_store, indexer)
        questions = hunter.scan_and_hunt(since_days=3)
        draft = hunter.draft_answer(questions[0])
    """

    def __init__(
        self,
        vault_path: Path,
        vector_store: ChromaVectorStore,
        indexer: ObsidianIndexer,
    ) -> None:
        """Initialize the hunter.

        Args:
            vault_path: Root path of the Obsidian vault.
            vector_store: ChromaDB vector store for semantic retrieval.
            indexer: ObsidianIndexer for keyword-based search and writing samples.
        """
        self.vault_path = vault_path
        self.vector_store = vector_store
        self.indexer = indexer
        self.llm = GeminiLLM()

    # ── Shared helpers ─────────────────────────────────────────────────────────

    @staticmethod
    def _strip_frontmatter(text: str) -> str:
        """Remove a leading YAML frontmatter block (``---`` fenced) and trim.

        Args:
            text: Raw markdown file content.

        Returns:
            Content without the frontmatter header, stripped of surrounding
            whitespace. Text without frontmatter is returned stripped as-is.
        """
        return re.sub(r'^---\n.*?\n---\n', '', text, flags=re.DOTALL).strip()

    @staticmethod
    def _clean_question_url(url: str) -> str:
        """Normalize a Zhihu question URL for deduplication and display.

        Strips trailing punctuation/markup artifacts (Slack wraps URLs in
        ``<...>``), query parameters, fragments, and the trailing slash.

        Args:
            url: Raw URL string as pasted or returned by search.

        Returns:
            Canonical URL string.
        """
        return (
            re.sub(r'[>\'")\].,;]+$', '', url)
            .lstrip("<")
            .split("?")[0]
            .split("#")[0]
            .rstrip("/")
        )

    # ── Public API ─────────────────────────────────────────────────────────────

    def scan_and_hunt(
        self,
        since_days: int = 7,
        dirs: Optional[List[str]] = None,
        topic_hint: str = "",
    ) -> List[ZhihuQuestion]:
        """Scan vault files and surface matching Zhihu questions.

        Args:
            since_days: Age threshold for non-Article directories.
            dirs: Restrict scan to these vault dirs (relative paths). None = all WATCH_DIRS.
            topic_hint: Optional focus hint injected into keyword extraction prompt.

        Returns:
            List of ZhihuQuestion candidates (up to _MAX_QUESTIONS_TOTAL).
        """
        scan_desc = f"dirs={dirs or 'all'}" + (f", topic='{topic_hint}'" if topic_hint else "")
        logger.info(f"ZhihuHunter: scanning vault ({scan_desc})")

        keywords = self._scan_vault_for_keywords(since_days, dirs=dirs, topic_hint=topic_hint)
        if not keywords:
            logger.info("No keywords extracted from vault files")
            return []

        logger.info(f"Extracted keywords: {keywords}")
        questions = self._find_questions(keywords)
        logger.info(f"Found {len(questions)} Zhihu questions")
        return questions

    def scan_single_file(self, rel_path: str) -> List[ZhihuQuestion]:
        """Extract keywords from one vault file and find matching Zhihu questions.

        The returned questions carry ``source_file`` set to ``rel_path`` so that
        ``draft_answer()`` will ground the generated answer in that article.

        Args:
            rel_path: Path relative to vault_path (e.g. "Article/8-xxx-Revised.md").

        Returns:
            List of ZhihuQuestion candidates with source_file populated.

        Raises:
            FileNotFoundError: If the file does not exist under the vault root.
        """
        file_path = self.vault_path / rel_path
        if not file_path.exists():
            raise FileNotFoundError(f"文件不存在：{file_path}")

        text = self._strip_frontmatter(
            file_path.read_text(encoding="utf-8", errors="ignore")
        )
        logger.info(f"scan_single_file: {rel_path} ({len(text)} chars)")

        # Cap input so the keyword prompt stays within budget.
        keywords = self._extract_keywords_with_llm(text[:5000])
        if not keywords:
            logger.info("No keywords extracted from single file")
            return []

        logger.info(f"Keywords from file: {keywords}")
        questions = self._find_questions(keywords)

        for q in questions:
            q.source_file = rel_path

        return questions

    def scan_glob_files(self, pattern: str) -> List[ZhihuQuestion]:
        """Find vault files matching a glob pattern and hunt Zhihu questions.

        All matched files are combined for keyword extraction, so the questions
        reflect the full set of content.  The most-recently-modified matched file
        is stored as ``source_file`` to ground the answer draft.

        Args:
            pattern: Glob pattern relative to vault_path
                     (e.g. "Article/*openclaw*-Revised.md").

        Returns:
            List of ZhihuQuestion candidates with source_file set.

        Raises:
            FileNotFoundError: If no files match the pattern.
        """
        matched = sorted(
            self.vault_path.glob(pattern),
            key=lambda p: p.stat().st_mtime,
            reverse=True,
        )
        if not matched:
            raise FileNotFoundError(f"没有文件匹配：{pattern}")

        names = [p.name for p in matched]
        logger.info(f"scan_glob_files: '{pattern}' → {len(matched)} files: {names}")

        # Combine content from all matched files (per-file budget so no one file
        # dominates) and extract keywords from the aggregate.
        parts: List[str] = []
        per_file_budget = 1500 if len(matched) == 1 else 800
        for fp in matched:
            try:
                text = self._strip_frontmatter(
                    fp.read_text(encoding="utf-8", errors="ignore")
                )
                parts.append(f"[{fp.name}]\n{text[:per_file_budget]}")
            except OSError as e:
                logger.debug(f"Could not read {fp}: {e}")

        combined = "\n\n---\n\n".join(parts)
        keywords = self._extract_keywords_with_llm(combined[:8000])
        if not keywords:
            logger.info("No keywords extracted from glob-matched files")
            return []

        logger.info(f"Keywords from glob: {keywords}")
        questions = self._find_questions(keywords)

        # Primary source = most-recently-modified matched file
        primary = str(matched[0].relative_to(self.vault_path))
        for q in questions:
            q.source_file = primary

        return questions

    def hunt_direct_url(self, url: str) -> List[ZhihuQuestion]:
        """Create a question card directly from a Zhihu question URL.

        Used when the user pastes a URL they were invited to answer.
        Fetches the page title and description so the card displays readable info.

        Args:
            url: A zhihu.com/question/<id> URL.

        Returns:
            Single-element list containing the ZhihuQuestion.
        """
        clean_url = self._clean_question_url(url)
        title, description = self._fetch_question_info(clean_url)
        logger.info(f"Direct URL hunt: {clean_url!r} → title={title!r}, desc={len(description or '')} chars")
        return [ZhihuQuestion(title=title, url=clean_url, description=description)]

    def _fetch_question_info(self, url: str) -> tuple[str, Optional[str]]:
        """Fetch the question title and description via Playwright.

        Falls back to a URL-derived label if Playwright is unavailable or fails.

        Args:
            url: Cleaned question URL.

        Returns:
            Tuple of (title, description). Description may be None.
        """
        # Saved login cookies; path assumes repo layout <root>/data/zhihu_state.json
        state_file = Path(__file__).resolve().parents[3] / "data" / "zhihu_state.json"
        try:
            from playwright.sync_api import sync_playwright
            from playwright_stealth import Stealth
            import json as _json

            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                try:
                    context = browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/122.0.0.0 Safari/537.36"
                        )
                    )
                    # Load saved cookies if available
                    if state_file.exists():
                        data = _json.loads(state_file.read_text(encoding="utf-8"))
                        cookies = data.get("cookies", [])
                        if cookies:
                            context.add_cookies(cookies)

                    page = context.new_page()
                    Stealth().apply_stealth_sync(page)
                    page.goto(url, wait_until="domcontentloaded", timeout=15000)
                    page.wait_for_timeout(2000)  # give client-side rendering a moment

                    # Extract title: prefer the question header, fall back to tab title
                    title = ""
                    title_el = page.query_selector("h1.QuestionHeader-title")
                    if title_el:
                        title = title_el.inner_text().strip()
                    if not title:
                        title = page.title() or ""
                        title = title.split(" - 知乎")[0].strip()
                    # Strip "(XX 条消息)" notification prefix from tab title
                    title = re.sub(r'^\(\d+ 条消息\)\s*', '', title)

                    # Extract question description
                    description = None
                    desc_el = page.query_selector("div.QuestionRichText")
                    if desc_el:
                        description = desc_el.inner_text().strip()
                        # Clean up truncation artifacts and zero-width chars
                        description = description.replace("\u200b", "").strip()
                        description = re.sub(r'…?\s*显示全部\s*$', '', description).strip()
                    if not description:
                        og_el = page.query_selector('meta[property="og:description"]')
                        if og_el:
                            description = (og_el.get_attribute("content") or "").strip()
                            description = re.sub(r'…$', '', description).strip()
                    # Filter out Zhihu generic site description (not question-specific)
                    if description and "知乎，中文互联网" in description:
                        description = None
                    # Normalize: empty string → None, cap length
                    description = description[:2000] if description else None

                    if title and title != "知乎":
                        return title, description
                finally:
                    # Always release the browser, even when extraction raises.
                    browser.close()
        except Exception as e:
            logger.debug(f"Playwright question fetch failed for {url}: {e}")

        # Fallback: use the question ID
        qid = url.rstrip("/").split("/")[-1]
        return f"知乎问题 #{qid}", None

    def _fetch_recent_web_context(self, query: str) -> str:
        """Search the web for recent articles/news about the query topic.

        Results are injected into the answer prompt so it reads like it
        comes from someone who's actively following the latest developments.

        Args:
            query: Topic or question title to search.

        Returns:
            Formatted string of recent snippets, or empty string on failure.
        """
        api_key = os.environ.get("TAVILY_API_KEY")
        if not api_key:
            return ""
        try:
            from tavily import TavilyClient
            client = TavilyClient(api_key=api_key)
            results = client.search(
                query=query,
                search_depth="basic",
                max_results=4,
                days=90,
                exclude_domains=["zhihu.com"],
            )
            items = results.get("results", [])
            if not items:
                return ""
            parts: List[str] = []
            for r in items:
                # Guard per item: a single malformed result (missing key,
                # relative URL) must not discard the whole batch.
                link = r.get("url", "")
                domain = link.split("/")[2] if link.count("/") >= 2 else link
                parts.append(f"- {r.get('title', '')} ({domain}): {r.get('content', '')[:250]}")
            return "\n".join(parts)
        except Exception as e:
            logger.debug(f"Recent web context fetch failed: {e}")
            return ""

    def draft_answer(self, question: ZhihuQuestion, feedback: str = "") -> AnswerDraft:
        """Generate a Zhihu-style answer draft for a question.

        Delegates to the existing ZhihuGenerator (reuses prompt + style guide).
        The RAG context is enriched by both semantic (ChromaDB) and
        keyword-based (ObsidianIndexer) retrieval.

        Args:
            question: ZhihuQuestion to answer.
            feedback: Optional revision instructions from the user (e.g. "更口语化一些").

        Returns:
            AnswerDraft with generated content and source list.
        """
        logger.info(f"Drafting answer for: {question.title}" + (f" [feedback: {feedback[:40]}]" if feedback else ""))

        # Collect RAG sources for transparency
        rag_sources: List[str] = []

        semantic_results = self.vector_store.search_knowledge(question.title, top_k=3)
        for r in semantic_results:
            rag_sources.append(Path(r.source_path).name)

        keyword_results = self.indexer.search(question.title, limit=5)
        for note in keyword_results:
            first_line = note.split("\n")[0].strip("# ").strip()
            if first_line:
                rag_sources.append(first_line)

        # If the question was pinned to a specific source file, load it as the
        # primary article so the answer stays grounded in its viewpoints.
        primary_article = ""
        if question.source_file:
            fp = self.vault_path / question.source_file
            try:
                raw = self._strip_frontmatter(
                    fp.read_text(encoding="utf-8", errors="ignore")
                )
                primary_article = raw[:4000]
                rag_sources.insert(0, Path(question.source_file).name)
                logger.info(f"Loaded primary article: {question.source_file} ({len(primary_article)} chars)")
            except OSError as e:
                logger.warning(f"Could not read source file {question.source_file}: {e}")

        # Delegate to existing ZhihuGenerator for consistent style
        generator = ZhihuGenerator(self.indexer, self.vector_store)
        prompt_input = question.title
        if question.description:
            prompt_input += f"\n\n问题详细描述：{question.description}"
        if question.snippet:
            prompt_input += f" | 背景：{question.snippet}"

        # User-provided outline takes highest priority — structure the answer strictly
        # around it, pulling matching content from the knowledge base for each point.
        if question.outline:
            prompt_input += (
                f"\n\n用户提供的回答思路（必须严格按照这个逻辑框架展开，"
                f"从知识库中找到对应的内容来支撑每个论点，不要自由发挥框架结构）：\n{question.outline}"
            )
            logger.info(f"Injected user outline: {question.outline[:60]!r}")

        if primary_article:
            prompt_input += (
                f"\n\n主要参考文章（请基于此文的观点和论据来回答，"
                f"可直接引用其中的分析和结论）：\n{primary_article}"
            )

        # Inject recent web context so the answer feels up-to-date
        recent_context = self._fetch_recent_web_context(question.title)
        if recent_context:
            prompt_input += (
                f"\n\n近期相关资讯（最近90天内的动态，用来补充最新工具/发布/讨论，"
                f"让回答更有时效感，自然地融入即可，不要刻意罗列）：\n{recent_context}"
            )
            logger.info(f"Injected recent web context ({len(recent_context)} chars)")

        if feedback:
            prompt_input += f"\n\n用户修改意见：{feedback}"

        content, _ = generator.chat(prompt_input, history=[])

        return AnswerDraft(
            question=question,
            content=content,
            rag_sources=list(dict.fromkeys(rag_sources)),  # deduplicate, keep order
        )

    # ── Vault Scanning ─────────────────────────────────────────────────────────

    def _scan_vault_for_keywords(
        self,
        since_days: int,
        dirs: Optional[List[str]] = None,
        topic_hint: str = "",
    ) -> List[str]:
        """Collect vault files and extract topic keywords via LLM.

        Args:
            since_days: Age threshold for non-Article directories.
            dirs: Dirs to scan; None = all WATCH_DIRS.
            topic_hint: Optional focus instruction for keyword extraction.

        Returns:
            Deduplicated list of topic keywords (up to _MAX_KEYWORDS).
        """
        cutoff = datetime.now(tz=timezone.utc) - timedelta(days=since_days)
        files = self._collect_recent_files(cutoff, dirs=dirs)

        if not files:
            return []

        # Article-only scans get a larger per-file budget (higher priority content)
        scan_dirs = dirs or WATCH_DIRS
        per_file_budget = 800 if scan_dirs == [_ARTICLE_DIR] else 500
        combined = self._read_files_summary(files, max_chars_per_file=per_file_budget)
        return self._extract_keywords_with_llm(combined, topic_hint=topic_hint)

    def _collect_recent_files(
        self,
        cutoff: datetime,
        dirs: Optional[List[str]] = None,
    ) -> List[Path]:
        """Walk vault directories and return candidate files.

        Article directory:
          - No time filter (always included regardless of cutoff)
          - Revised files sorted first, then by mtime descending
          - Up to _MAX_FILES_ARTICLE slots
        Other directories:
          - Only files modified after cutoff
          - Sorted by mtime descending
          - Up to _MAX_FILES_PER_DIR slots

        Args:
            cutoff: Freshness threshold for non-Article directories.
            dirs: Directories to scan (relative paths). None = all WATCH_DIRS.

        Returns:
            List of Path objects.
        """
        scan_dirs = dirs if dirs is not None else WATCH_DIRS
        collected: List[Path] = []

        for rel_dir in scan_dirs:
            watch_dir = self.vault_path / rel_dir
            if not watch_dir.is_dir():
                logger.debug(f"Watch dir not found, skipping: {watch_dir}")
                continue

            # (mtime, is_revised, path)
            candidates: List[tuple[float, bool, Path]] = []
            for md_file in watch_dir.rglob("*.md"):
                try:
                    mtime = md_file.stat().st_mtime
                    if rel_dir != _ARTICLE_DIR:
                        file_dt = datetime.fromtimestamp(mtime, tz=timezone.utc)
                        if file_dt < cutoff:
                            continue
                    is_revised = md_file.name.lower().endswith("-revised.md")
                    candidates.append((mtime, is_revised, md_file))
                except OSError:
                    # File vanished or is unreadable mid-walk; skip it.
                    continue

            if rel_dir == _ARTICLE_DIR:
                # Revised files first, then newest; give double slots
                candidates.sort(key=lambda t: (0 if t[1] else 1, -t[0]))
                max_files = _MAX_FILES_ARTICLE
            else:
                candidates.sort(key=lambda t: -t[0])
                max_files = _MAX_FILES_PER_DIR

            collected.extend(p for _, _, p in candidates[:max_files])

        logger.debug(f"Collected {len(collected)} files from vault (dirs={scan_dirs})")
        return collected

    def _read_files_summary(
        self,
        files: List[Path],
        max_chars: int = 8000,
        max_chars_per_file: int = 500,
    ) -> str:
        """Read and concatenate file contents up to a character budget.

        Each file is capped at max_chars_per_file so that no single large file
        exhausts the total budget and crowds out other directories.

        Args:
            files: List of markdown file paths.
            max_chars: Total character budget across all files.
            max_chars_per_file: Per-file cap to ensure directory diversity.

        Returns:
            Concatenated content string with file-name headers.
        """
        parts: List[str] = []
        remaining = max_chars

        for f in files:
            if remaining <= 0:
                break
            try:
                text = self._strip_frontmatter(
                    f.read_text(encoding="utf-8", errors="ignore")
                )
                chunk = text[:min(max_chars_per_file, remaining)]
                parts.append(f"[{f.name}]\n{chunk}")
                remaining -= len(chunk)
            except OSError as e:
                logger.debug(f"Could not read {f}: {e}")

        return "\n\n---\n\n".join(parts)

    def _extract_keywords_with_llm(
        self,
        combined_text: str,
        topic_hint: str = "",
    ) -> List[str]:
        """Use LLM to distill topic keywords from aggregated vault content.

        Args:
            combined_text: Concatenated recent vault content.
            topic_hint: Optional focus instruction (e.g. "agent开发").

        Returns:
            List of keyword strings (up to _MAX_KEYWORDS).
        """
        topic_instruction = (
            f"\n- 重点关注与「{topic_hint}」相关的话题，优先提取这个方向的关键词"
            if topic_hint else ""
        )
        prompt = f"""你是一位知乎内容运营专家。请从以下笔记内容中提炼出适合在知乎搜索问题的关键词。

要求：
- 输出 {_MAX_KEYWORDS} 个以内的关键词，每行一个，无序号无符号
- 关键词要在知乎上有真实讨论热度，即普通中国用户会关心并提问的话题领域
- 避免过于偏门的专有名词（如具体项目代号、小众产品名），要用它所属的上一级概念
  例：「OpenClaw」→「AI智能体安全」；「BadJs」→「微信监控机制」；「Moon Monitor」→「健康监测设备」
- 不要太泛（如"AI""技术""健康"），也不要太窄（如只有一个工具代码名称）
- 优先选择最近讨论最多、最有深度的话题{topic_instruction}
- 只输出关键词列表，不要解释

=== 笔记内容 ===
{combined_text[:5000]}
"""
        try:
            response, _ = self.llm.generate_response(prompt, [])
            # One keyword per line; drop blanks and markdown headings the
            # model sometimes emits despite instructions.
            keywords = [
                line.strip()
                for line in response.strip().splitlines()
                if line.strip() and not line.strip().startswith("#")
            ]
            return keywords[:_MAX_KEYWORDS]
        except Exception as e:
            logger.error(f"Keyword extraction failed: {e}")
            return []

    # ── Question Discovery ─────────────────────────────────────────────────────

    def _find_questions(self, keywords: List[str]) -> List[ZhihuQuestion]:
        """Search Tavily for Zhihu questions matching each keyword.

        Collects candidates from ALL keywords before truncating so that later
        keywords (which can cover different facets) are not silently dropped.

        Args:
            keywords: List of topic keywords to search.

        Returns:
            Deduplicated ZhihuQuestion list (up to _MAX_QUESTIONS_TOTAL).

        Raises:
            RuntimeError: If TAVILY_API_KEY is not configured.
        """
        from tavily import TavilyClient

        api_key = os.environ.get("TAVILY_API_KEY")
        if not api_key:
            raise RuntimeError("TAVILY_API_KEY not set")

        client = TavilyClient(api_key=api_key)
        seen_urls: set[str] = set()
        questions: List[ZhihuQuestion] = []

        for kw in keywords:
            try:
                results = self._tavily_search(client, kw)
                for item in results:
                    url = item.get("url", "")
                    if not _MIN_QUESTION_URL_RE.search(url):
                        continue  # skip non-question URLs (e.g. zhihu.com/people/...)
                    # Normalize: extract bare URL to strip invisible chars, then remove params
                    _m = re.search(r'https?://\S+', url)
                    if not _m:
                        continue
                    clean_url = self._clean_question_url(_m.group(0))
                    # Skip answer/comment subpages — we want the question itself
                    if re.search(r'/answer/|/comment/', clean_url):
                        continue
                    if clean_url in seen_urls:
                        continue
                    seen_urls.add(clean_url)
                    questions.append(
                        ZhihuQuestion(
                            title=item.get("title") or kw,  # fall back on empty titles too
                            url=clean_url,
                            snippet=item.get("content", "")[:200] or None,
                            keywords=[kw],
                        )
                    )
            except Exception as e:
                logger.warning(f"Tavily search failed for keyword '{kw}': {e}")

        logger.info(f"Found {len(questions)} Zhihu questions (capped at {_MAX_QUESTIONS_TOTAL})")
        return questions[:_MAX_QUESTIONS_TOTAL]

    def _tavily_search(self, client, keyword: str) -> List[dict]:
        """Run a single Tavily search scoped to zhihu.com/question.

        Args:
            client: Authenticated TavilyClient instance.
            keyword: Search keyword.

        Returns:
            List of result dicts from Tavily (url, title, content, score).
        """
        query = f"site:zhihu.com/question {keyword}"
        logger.debug(f"Tavily search: {query}")
        response = client.search(
            query=query,
            search_depth="basic",
            max_results=8,
            include_domains=["zhihu.com"],
            days=365,   # prefer questions with recent activity (higher chance of top rank)
        )
        return response.get("results", [])
