import os
import shutil

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# Configuration defaults (overridable via build_vector_store() keyword args).
SOURCE_DIRECTORY = "/data/rag_source"
PERSIST_DIRECTORY = "/data/db"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


def build_vector_store(
    source_directory: str = SOURCE_DIRECTORY,
    persist_directory: str = PERSIST_DIRECTORY,
    chunk_size: int = CHUNK_SIZE,
    chunk_overlap: int = CHUNK_OVERLAP,
    embedding_model: str = "nomic-embed-text",
) -> None:
    """Build a Chroma vector store from the documents in ``source_directory``.

    Any existing database at ``persist_directory`` is deleted first, so each
    run produces a fresh index.

    Args:
        source_directory: Directory tree to load documents from.
        persist_directory: Where the Chroma database is written.
        chunk_size: Maximum characters per text chunk.
        chunk_overlap: Characters of overlap between consecutive chunks.
        embedding_model: Name of the Ollama embedding model to use.

    Returns:
        None. Side effects: removes ``persist_directory`` if present, then
        writes a new Chroma database there. Returns early (no DB created)
        when no documents are found.
    """
    print("Starting to build vector store...")

    # Clean up old database so stale chunks from removed files cannot linger.
    if os.path.exists(persist_directory):
        print(f"Removing old database from {persist_directory}")
        shutil.rmtree(persist_directory)

    # Load the documents.
    # NOTE(review): the "**/*.*" glob skips files without an extension
    # (e.g. README, Makefile) — confirm that is intended.
    print(f"Loading documents from {source_directory}...")
    loader = DirectoryLoader(
        source_directory,
        glob="**/*.*",
        show_progress=True,
        use_multithreading=True,
    )
    documents = loader.load()

    if not documents:
        print("No documents found. Exiting.")
        return
    print(f"Loaded {len(documents)} documents.")

    # Split the documents into overlapping chunks sized for retrieval.
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = text_splitter.split_documents(documents)
    print(f"Split documents into {len(texts)} chunks.")

    # Create the embeddings (requires a running Ollama server).
    print("Creating embeddings...")
    embeddings = OllamaEmbeddings(model=embedding_model, show_progress=True)

    # Create and persist the vector store.
    print("Creating and persisting vector store...")
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
    print("Vector store created successfully.")

    # Chroma >= 0.4 persists automatically and removed `persist()`; calling it
    # unconditionally raises AttributeError there. Guard so the script works
    # on both old and new Chroma versions.
    if hasattr(db, "persist"):
        db.persist()
    print("Vector store persisted.")


if __name__ == "__main__":
    build_vector_store()