
    &Ri                         d Z ddlZddlZddlmZmZ ddlmZ ddlmZm	Z	 ddl
mZmZ ddlmZ ddlmZmZ  ee          Z G d	 d
e          Z G d de          Zdedee         fdZ G d d          ZdS )z6ChromaDB vector store for the Second Brain RAG system.    N)datetimetimezone)Path)ListOptional)	BaseModelField)setup_logger)EmbeddingProviderget_embedding_providerc                       e Zd ZU dZeed<   eed<   eed<   dZee         ed<    e	e
          Zee         ed<   eed	<   dS )
ChunkMetadataz6Metadata for a document chunk stored in the vector DB.source_pathchunk_indexheader_hierarchyN
note_title)default_factorytagsingested_at)__name__
__module____qualname____doc__str__annotations__intr   r   r	   listr   r        8/root/projects/butler/slack_bot/obsidian/vector_store.pyr   r      s}         @@ $J$$$eD111D$s)111r   r   c                   V    e Zd ZU dZeed<   eed<   eed<   eed<   dZee         ed<   dS )SearchResultz A single semantic search result.textr   r   distanceNr   )	r   r   r   r   r   r   floatr   r   r   r   r    r"   r"      sV         **
IIIOOO $J$$$$$r   r"   bodyreturnc                 D    t          j        d|           }d |D             S )zSplit a markdown body by H2/H3 headers into chunks.

    Args:
        body: Markdown text to split.

    Returns:
        List of non-empty chunks of at least 50 characters.
    z\n(?=#{2,3} )c                     g | ];}t          |                                          d k    '|                                <S )2   )lenstrip).0chunks     r    
<listcomp>z'split_md_by_headers.<locals>.<listcomp>0   s:    NNNeS5G5G25M5MEKKMM5M5M5Mr   )resplit)r&   
raw_chunkss     r    split_md_by_headersr3   &   s*     *D11JNNzNNNNr   c                       e Zd ZdZdZ	 	 ddee         dee         ddfdZde	d	e
de	fd
Zdee	         dee         ddfdZdde	de
dee         fdZddZdefdZdS )ChromaVectorStorez0ChromaDB-backed vector store for Obsidian notes.obsidian_notesNembedding_providerdb_pathr'   c                    |t                      }|| _        |ddlm} |j        dz  }|                    dd           || _        	 ddl}|                    t          |                    | _
        | j
                            | j        dd	i
          | _        t                              d| d| j         d           dS # t           $ r t!          d          w xY w)a$  Initialize the ChromaDB vector store.

        Args:
            embedding_provider: Provider for generating embeddings. Defaults to
                get_embedding_provider().
            db_path: Path to ChromaDB persistence directory. Defaults to
                DATA_DIR/vector_db.
        Nr   )config	vector_dbT)parentsexist_ok)path
hnsw:spacecosinenamemetadataz!ChromaVectorStore initialized at z (collection: )z;chromadb is not installed. Run: pip install chromadb>=0.5.0)r   r7   healthr:   DATA_DIRmkdirr8   chromadbPersistentClientr   _clientget_or_create_collectionCOLLECTION_NAME_collectionloggerinfoImportError)selfr7   r8   r:   rH   s        r    __init__zChromaVectorStore.__init__8   s1    %!7!9!9"4?%%%%%%o3GdT222	]OOO#44#g,,4GGDL#|DD)&1  E    D KK8G 8 8 $ 48 8 8      	] 	] 	][\\\	]s   	A:C Cr   r   c                     | d| }t          j        |                                                                          dd         S )a  Generate a deterministic 16-char document ID for idempotent upsert.

        Args:
            source_path: Path to the source file.
            chunk_index: Index of the chunk within the file.

        Returns:
            16-character hex string.
        :N   )hashlibsha256encode	hexdigest)rQ   r   r   raws       r    _make_doc_idzChromaVectorStore._make_doc_id^   sD     ,,{,,~cjjll++5577<<r   texts	metadatasc                     |sdS  fd|D             } j                             |          }g }|D ]I}|                                }d                    |d                   |d<   |                    |           J j                            ||||           t                              dt          |           d           dS )zAdd or update chunks in the vector store (idempotent upsert).

        Args:
            texts: List of text chunks.
            metadatas: Corresponding ChunkMetadata for each chunk.
        Nc                 P    g | ]"}                     |j        |j                  #S r   )r[   r   r   )r-   mrQ   s     r    r/   z0ChromaVectorStore.add_chunks.<locals>.<listcomp>u   s-    VVVq4$$Q]AMBBVVVr   ,r   )ids
embeddings	documentsr]   z	Upserted z chunks into vector store)
r7   embed
model_dumpjoinappendrM   upsertrN   debugr+   )rQ   r\   r]   doc_idsrc   
meta_dictsr`   ds   `       r    
add_chunkszChromaVectorStore.add_chunksk   s      	FVVVVIVVV,22599
 
 	! 	!AA6++AfIa    ! 	 	  	
 	
 	
 	FUFFFGGGGGr      querytop_kc                 V   | j                                         }|dk    rg S | j                            |g          d         }| j                             |gt          ||          g d          }g }|d         r|d         d         s|S t          |d         d         |d         d         |d         d                   D ]h\  }}}	|                    t          ||	                    dd	          |	                    d
d	          |	|	                    d                               i|S )zSemantic search over stored chunks.

        Args:
            query: Search query text.
            top_k: Maximum number of results to return.

        Returns:
            List of SearchResult ordered by relevance (lowest distance first).
        r   )rd   r]   	distances)query_embeddings	n_resultsincluderb   rd   r]   rs   r    r   r   )r#   r   r   r$   r   )
rM   countr7   re   rp   minziprh   r"   get)
rQ   rp   rq   totalquery_embeddingresultssearch_resultsdocmetadists
             r    search_knowledgez"ChromaVectorStore.search_knowledge   s\     &&((A::I177@@C"((-.%'';;; ) 
 
 .0u~ 	"WU^A%6 	"!!"K #K #K # 
  
 	 	OCt
 !! $ ; ;%)XX.@"%E%E!#xx55      r   c                     | j                             | j                   | j                             | j        ddi          | _        t
                              d| j         d           dS )z=Delete and recreate the collection, removing all stored data.r?   r@   rA   zCollection 'z' resetN)rJ   delete_collectionrL   rK   rM   rN   rO   rQ   s    r    resetzChromaVectorStore.reset   sr    &&t';<<<<@@%"H- A 
 
 	@4#7@@@AAAAAr   c                 j    | j         | j                                        t          | j                  dS )zReturn basic collection statistics.

        Returns:
            Dict with collection_name, total_chunks, and db_path.
        )collection_nametotal_chunksr8   )rL   rM   rx   r   r8   r   s    r    	get_statszChromaVectorStore.get_stats   s8      $3 ,22444<((
 
 	
r   )NN)ro   )r'   N)r   r   r   r   rL   r   r   r   rR   r   r   r[   r   r   rn   r"   r   r   dictr   r   r   r    r5   r5   3   s8       ::&O ;?"&$] $]$%67$] $$] 
	$] $] $] $]L= =# =# = = = =HS	 Hd=6I Hd H H H H8( (c (# (d<>P ( ( ( (TB B B B

4 

 

 

 

 

 

r   r5   )r   rV   r0   r   r   pathlibr   typingr   r   pydanticr   r	   health.utils.logging_configr
   slack_bot.obsidian.embeddingsr   r   r   rN   r   r"   r   r3   r5   r   r   r    <module>r      sl   < <  				 ' ' ' ' ' ' ' '       ! ! ! ! ! ! ! ! % % % % % % % % 4 4 4 4 4 4 S S S S S S S S	h		    I   % % % % %9 % % %
Oc 
Od3i 
O 
O 
O 
OQ
 Q
 Q
 Q
 Q
 Q
 Q
 Q
 Q
 Q
r   