
    ٦i?                         d Z ddlZddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZmZ  ee          Z G d
 de          Z G d d          ZdS )z6Note ingestion engine for the Second Brain RAG system.    N)datetimetimezone)Path)ListOptional)	BaseModel)setup_logger)	GeminiLLM)ChunkMetadataChromaVectorStoresplit_md_by_headersc                       e Zd ZU dZeed<   eed<   ee         ed<   ee         ed<   eed<   dZee         ed<   d	Z	eed
<   dS )ExtractedNotez2LLM-extracted structured note from raw input text.titlesummary
key_pointsentitiescleaned_bodyN
source_urlzhlanguage)
__name__
__module____qualname____doc__str__annotations__r   r   r   r        9/root/projects/butler/slack_bot/obsidian/note_ingester.pyr   r      sz         <<JJJLLLS	3i $J$$$Hcr   r   c                      e Zd ZdZdededdfdZdedefdZd	ede	e         fd
Z
ddhZdZdedefdZdedefdZdedefdZdedefdZdedefdZdede	e         defdZdede	e         defdZdedefdZdededdfdZdS )NoteIngesterzBIngests raw text or URLs into the Obsidian vault and vector store.
vault_pathvector_storereturnNc                     || _         |dz  | _        | j                            dd           || _        t	                      | _        dS )zInitialize the note ingester.

        Args:
            vault_path: Root path of the Obsidian vault.
            vector_store: ChromaDB vector store for indexing chunks.
        notesT)parentsexist_okN)r#   	notes_dirmkdirr$   r
   llm)selfr#   r$   s      r    __init__zNoteIngester.__init__"   sI     %#g-TD999(;;r   
user_inputc           	         |                      |          }d}|rrt                              d| d           	 |                     |          }|}n=# t          $ r.}t                              d| d           d| dcY d}~S d}~ww xY w|}	 |                     ||          }n:# t          $ r-}t                              d|            d	| d
cY d}~S d}~ww xY w|                     |          }| 	                    ||           d
                    d |j        dd         D                       }|                    | j                  }	d|j         d|j         d| d|	 d	S )zIngest text or a URL into the vault and vector store.

        Args:
            user_input: Raw text or a URL to ingest.

        Returns:
            Slack-formatted confirmation message.
        NzDetected URL: z. Fetching content...zURL fetch failed ()u+   ⚠️ 无法自动抓取该链接内容（uM   ）

请将文章正文直接粘贴发送给我，我来帮你保存入库。zLLM extraction failed: u/   ❌ 内容提取失败（LLM 解析出错）：u2   

请重试，或检查原始内容是否正常。 c              3   "   K   | ]
}d | dV  dS )z[[z]]Nr   ).0es     r    	<genexpr>z&NoteIngester.ingest.<locals>.<genexpr>S   s*      FFq
Q


FFFFFFr      u   ✅ *Note saved:* 
z
_u   _
📁 ``)_detect_urlloggerinfo
_fetch_url	Exceptionwarning_extract_with_llmerror_write_md_file_chunk_and_indexjoinr   relative_tor#   r   r   )
r-   r/   urlr   raw_textr5   note	file_pathentity_linksrel_paths
             r    ingestzNoteIngester.ingest/   s	    z**$(
 	"KKCCCCDDD	??3// 

   8A888999`! ` ` `      "H	}))(J??DD 	} 	} 	}LL6166777|Q|||||||||	} ''--	dI...xxFFDM"1"4EFFFFF((99! ! !|! !! ! ! ! !	
s;   A 
B	#B>B	B	B& &
C0"CCCtextc                     t          j        d|          }|sdS |                    d                              d          S )aw  Extract the first HTTP/HTTPS URL from text, handling Slack's <URL> format.

        Slack wraps URLs as ``<url>`` or ``<url|display text>``. The angle
        brackets and pipe are excluded from the match so they don't get
        appended to the URL.

        Args:
            text: Input text to scan.

        Returns:
            First clean URL found, or None.
        zhttps?://[^\s<>|]+Nr   z.,;:)researchgrouprstrip)r-   rM   matchs      r    r:   zNoteIngester._detect_url\   sA     	/66 	4{{1~~$$V,,,r   zmp.weixin.qq.comzweixin.qq.com   rF   c                    t          j        d|          }|r|                    d          nd}|| j        v r| j        | j        | j        g}n| j        | j        | j        g}t          d          }|D ]}	  ||          }t          |          | j
        k    r6t                              dt          |           d|j                    |c S t                              |j         dt          |           d           # t          $ r0}t                              |j         d	|            |}Y d
}~d
}~ww xY wt          d|           )u  Fetch URL content using multiple strategies with automatic fallback.

        Strategy order:
        - WeChat / known blocked domains: direct js_content parse → Tavily → Playwright
        - All other URLs: Tavily → Jina → Playwright

        Args:
            url: URL to fetch.

        Returns:
            Extracted text content (>= _MIN_CONTENT_LEN chars).

        Raises:
            RuntimeError: If all strategies fail.
        zhttps?://([^/]+)    zNo strategies availablezFetched z chars via z returned too little content (z chars)z	 failed: Nu   所有抓取策略均失败: )rO   rP   rQ   _JINA_FIRST_DOMAINS_fetch_direct_wechat_fetch_tavily_fetch_playwright_fetch_jinaRuntimeErrorlen_MIN_CONTENT_LENr;   r<   r   debugr>   )	r-   rF   
host_matchhost
strategieslast_errstrategycontentr5   s	            r    r=   zNoteIngester._fetch_urls   s     Y2C88
&08z"""b4+++3T5GI_`JJ,d.>@VWJ*+DEE" 		 		H"(3--w<<4#888KK W3w<< W WHDU W WXXX"NNN 1ffQTU\Q]Q]fffgggg    1??A??@@@ E8EEFFFs   5AD2D
D;&D66D;c                     ddl }d| }ddd}t          j                            d          }|rd| |d	<   |                    ||d
d          }|                                 |j        S )u   Fetch via Jina Reader (r.jina.ai) — works well for WeChat and most sites.

        Args:
            url: Target URL.

        Returns:
            Markdown text extracted by Jina.
        r   Nzhttps://r.jina.ai/z
text/plainmarkdown)AcceptzX-Return-FormatJINA_API_KEYzBearer Authorization   Theaderstimeoutfollow_redirects)httpxosenvirongetraise_for_statusrM   )r-   rF   rq   jina_urlrn   jina_keyresps          r    r\   zNoteIngester._fetch_jina   s     	---#/JOO:>>.11 	<';';';GO$yy7BQUyVVyr   c                 2   ddl }ddd}|                    ||dd          }|                                 |j        }g }t	          j        d	|t          j                  }|rUt	          j        d
d|                    d                    	                                }|r|
                    d|            t	          j        d|          }	|	r|                    d|	                                          dz   }
g d}t          |          }|D ]*}|                    ||
          }d|cxk     r|k     rn (|}+||
|         }t	          j        d
d|          }t	          j        dd|          }t	          j        dd|          }t	          j        dd|          	                                }|r|
                    |           |s>t	          j        d
d|          }t	          j        dd|          	                                S d                    |          S )a  Directly fetch a WeChat public article by parsing js_content from HTML.

        WeChat public articles (mp.weixin.qq.com) embed the full body in
        <div id="js_content">. This method extracts title + body without
        needing a headless browser or external proxy.

        Args:
            url: WeChat article URL.

        Returns:
            Plain text of title + article body.
        r   NzoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36zzh-CN,zh;q=0.9)z
User-AgentzAccept-Language   Trm   z>class=["\'][^"\']*rich_media_title[^"\']*["\'][^>]*>(.*?)</h1>z<[^>]+>rW   rV   z# zid=["\']js_content["\']>)zid="js_bottom_bar"zid="js_related_posts"zid="js_msg_card"zid="js_sg_bar"zid="js_pc_qr_code"zclass="qr_code_pc_outer"r2   z&nbsp;z&[a-zA-Z]+;z\s+

)rq   rt   ru   rM   rO   rP   DOTALLsubrQ   stripappendindexendr^   findrD   )r-   rF   rq   rn   rx   htmlpartstitle_m
title_textstart_mcontent_startend_markerscontent_endmarkeridx	body_htmlbodyrM   s                     r    rY   z!NoteIngester._fetch_direct_wechat   sC    	1  0
 
 yygrDyQQy )M")
 
  	0
Ba0@0@AAGGIIJ 0.*../// )6== 	# JJsGKKMM::Q>M  K d))K% & &ii66s(((([((((("%K];67I6*c955D6)S$//D6."d33D6&#t,,2244D #T""" 	56*c400D6&#t,,22444{{5!!!r   c                 D   ddl m} t          j                            d          }|st          d           ||          }|                    |g          }|r7|                    d          r"|d         d                             dd	          S t          d
          )zFetch via Tavily extract API.

        Args:
            url: Target URL.

        Returns:
            Raw content string from Tavily.

        Raises:
            RuntimeError: If API key missing or no content returned.
        r   )TavilyClientTAVILY_API_KEYz(TAVILY_API_KEY is not set in environment)api_key)urlsresultsraw_contentrW   u   Tavily 未能提取到内容)tavilyr   rr   rs   rt   r]   extract)r-   rF   r   r   clientresults         r    rZ   zNoteIngester._fetch_tavily   s     	('''''*..!122 	KIJJJg...cU++ 	?fjj++ 	?)$Q'++M2>>>9:::r   c                    	 ddl m} n# t          $ r t          d          w xY w	 ddlm} n# t          $ r t          d          w xY w |            5 }|j                            d          }	 |                                } |                                |           |	                    |dd	
           |
                    d           |                    d                                          }t                              dt          |           d|            ||                                 cddd           S # |                                 w xY w# 1 swxY w Y   dS )a  Fetch via headless Chromium with stealth mode (last resort).

        Renders the full page in a real browser, bypassing JS-heavy SPAs and
        most anti-bot fingerprinting. Slower (~3-5s) but most comprehensive.

        Args:
            url: Target URL.

        Returns:
            Visible body text extracted from the rendered page.

        Raises:
            ImportError: If playwright or playwright-stealth is not installed.
            RuntimeError: If the page loads but body text is too short.
        r   )sync_playwrightzWplaywright is not installed. Run: pip install playwright && playwright install chromium)StealthzHplaywright-stealth is not installed. Run: pip install playwright-stealthT)headlessi0u  domcontentloaded)ro   
wait_untili  r   zPlaywright fetched z chars from N)playwright.sync_apir   ImportErrorplaywright_stealthr   chromiumlaunchnew_pageapply_stealth_syncgotowait_for_timeout
inner_textr   r;   r<   r^   close)r-   rF   r   r   pbrowserpagerf   s           r    r[   zNoteIngester._fetch_playwright  s    	;;;;;;; 	 	 	.  	
	2222222 	 	 	Z  	
 _ 	 !j'''66G
 ''))		,,T222		#u9K	LLL%%d+++//&117799Q#g,,QQCQQRRR	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 s<   	 #. AE2B3E%EEEE#&E#rG   r   c                 z   |rd| dnd}d| d|dd          d}| j                             |g           \  }}|                                }|                    d	          r,t	          j        d
d|          }t	          j        dd|          }t          j        |          }t          	                    |          S )ac  Use LLM to extract structured note data from raw text.

        Args:
            raw_text: Raw content to process (truncated to 8000 chars).
            source_url: Optional source URL for context.

        Returns:
            Validated ExtractedNote model.

        Raises:
            ValueError: If LLM output cannot be parsed as valid JSON.
        zSource URL: r8   rW   z^You are a knowledge extraction assistant. Extract structured information from the text below.
z
=== TEXT ===
Ni@  a  

=== TASK ===
Return a JSON object with EXACTLY these fields:
- "title": A concise, descriptive title (max 80 chars)
- "summary": 2-3 sentence summary of the core content
- "key_points": Array of up to 7 key points (strings)
- "entities": Array of important people, concepts, projects, or tools mentioned (max 10)
- "cleaned_body": The full text, cleaned and reorganized. Wrap entity names in [[double brackets]] like [[entity name]].
- "language": "zh" if the content is primarily Chinese, "en" otherwise

CRITICAL:
- Output ONLY valid JSON, no markdown code blocks, no explanation
- Do NOT include tags like #writing_sample or #reply_sample
- Entities should be wrapped as [[entity]] in cleaned_body
z```z^```(?:json)?\n?z\n?```$)
r,   generate_responser   
startswithrO   r~   jsonloadsr   model_validate)r-   rG   r   url_contextpromptresponse_datas           r    r@   zNoteIngester._extract_with_llm;  s     8BI3Z3333r  
%4%  & h00<<! >>##u%% 	8v12x@@Hvj"h77Hz(##++D111r   c           	          |dd                              dd                                          dz   }t          ||dd         g g ||d          S )	zCreate a minimal note when LLM extraction fails.

        Args:
            raw_text: Raw content.
            source_url: Optional source URL.

        Returns:
            Basic ExtractedNote with raw text as body.
        N<   r8   r2   z...rT   r   )r   r   r   r   r   r   r   )replacer   r   )r-   rG   r   r   s       r    _fallback_notezNoteIngester._fallback_noteg  sf     "%%dC0066885@TcTN!!
 
 
 	
r   rH   c                    t          j        t          j                                      d          }t          j        dd|j                  dd                             d          }| d| d}| j	        |z  }|j
        rd	|j
         d
nd}d                    d |j        D                       }t          j        t          j                                                  }d|j         d| d|j         d| d| d| d|j         d|j         d}	|                    |	d           t$                              d|            |S )zWrite note to vault as a Markdown file with YAML frontmatter.

        Args:
            note: Extracted note data.

        Returns:
            Path to the written file.
        tzz%Y-%m-%dz[^\w\u4e00-\u9fff-]-N2   r   z.mdzsource_url: "z"
rW   r8   c              3   "   K   | ]
}d | dV  dS )z  - ""Nr   )r4   kps     r    r6   z.NoteIngester._write_md_file.<locals>.<genexpr>  s*      #N#NOROOO#N#N#N#N#N#Nr   z---
title: "z"
date: z#
tags:
  - auto_ingested
language: zkey_points:
z
ingested_at: z
---

## Summary



## Content

zutf-8)encodingzNote written: )r   nowr   utcstrftimerO   r~   r   r   r*   r   rD   r   	isoformatr   r   r   
write_textr;   r<   )
r-   rH   date_strslugfilenamerI   source_linekey_points_yamlingested_atrf   s
             r    rB   zNoteIngester._write_md_file|  s    <8<00099*EEv,c4:>>ssCII#NN*****NX-	@DW<t<<<<UW))#N#Ndo#N#N#NNNlhl333==??%
% %% %
 % % % % % % (% % |% %  % % % 	$ 	Ww7770Y00111r   rI   c                    dj          dj         }t          |          }|sj         dz   j        z   dd         g}t          j        t
          j                                                  fdt          |          D             }| j	        
                    ||           t                              dt          |           d	j                    dS )
zSplit note body into chunks and index them in the vector store.

        Args:
            note: Extracted note data.
            file_path: Path to the written markdown file.
        z## Summary

r   r|   Ni  r   c           
          g | ]e\  }}t          t                    ||                    d           r#|                    d          d         dd         ndj        dg          fS )#r8   r   Nd   r   auto_ingested)source_pathchunk_indexheader_hierarchy
note_titletagsr   )r   r   r   splitr   )r4   ichunkrI   r   rH   s      r    
<listcomp>z1NoteIngester._chunk_and_index.<locals>.<listcomp>  s     
 
 
 5 	NN272B2B32G2GSEKK%%a(#..V:%&'	 	 	
 
 
r   zIndexed z chunks for note: )r   r   r   r   r   r   r   r   	enumerater$   
add_chunksr;   r<   r^   r   )r-   rH   rI   r   chunks	metadatasr   s    ``   @r    rC   zNoteIngester._chunk_and_index  s     TSS@QSS$T** 	J|f,t/@@%4%HIFlhl333==??
 
 
 
 
 
 &f--
 
 
	 	$$VY777Js6{{JJdjJJKKKKKr   )r   r   r   r   r   r   r.   r   rL   r   r:   rX   r_   r=   r\   rY   rZ   r[   r   r@   r   rB   rC   r   r   r    r"   r"      s	       LL4 7H T    +
 +
 +
 +
 +
 +
Z- - - - - -( .?%Gc %Gc %G %G %G %GNs s    *F" F" F" F" F" F"P; ; ; ; ; ;0* S * S *  *  *  * X*2# *28C= *2] *2 *2 *2 *2X
s 
 
- 
 
 
 
*&= &T & & & &PL] Lt L L L L L L Lr   r"   )r   r   rr   rO   r   r   pathlibr   typingr   r   pydanticr   health.utils.logging_configr	   slack_bot.llm.geminir
   slack_bot.obsidian.vector_storer   r   r   r   r;   r   r"   r   r   r    <module>r      s;   < <  				 				 ' ' ' ' ' ' ' '       ! ! ! ! ! ! ! !       4 4 4 4 4 4 * * * * * * a a a a a a a a a a	h			 	 	 	 	I 	 	 	cL cL cL cL cL cL cL cL cL cLr   