diff --git a/ai-hub/app/app.py b/ai-hub/app/app.py
index dc5f6e6..df21ba3 100644
--- a/ai-hub/app/app.py
+++ b/ai-hub/app/app.py
@@ -4,7 +4,8 @@
 # Import centralized settings and other components
 from app.config import settings
-from app.core.vector_store import FaissVectorStore, get_embedder_from_config
+from app.core.vector_store.faiss_store import FaissVectorStore
+from app.core.vector_store.embedder.factory import get_embedder_from_config
 from app.core.retrievers import FaissDBRetriever, Retriever
 from app.core.services import RAGService
 from app.db.session import create_db_and_tables
diff --git a/ai-hub/app/core/retrievers.py b/ai-hub/app/core/retrievers.py
index 8f20713..d2c158b 100644
--- a/ai-hub/app/core/retrievers.py
+++ b/ai-hub/app/core/retrievers.py
@@ -1,7 +1,7 @@
 import abc
 from typing import List, Dict
 from sqlalchemy.orm import Session
-from app.core.vector_store import FaissVectorStore
+from app.core.vector_store.faiss_store import FaissVectorStore
 from app.db import models
 
 class Retriever(abc.ABC):
diff --git a/ai-hub/app/core/services.py b/ai-hub/app/core/services.py
index 40181c6..afa0e1f 100644
--- a/ai-hub/app/core/services.py
+++ b/ai-hub/app/core/services.py
@@ -4,8 +4,8 @@
 from sqlalchemy.exc import SQLAlchemyError
 import dspy
 
-from app.core.vector_store import FaissVectorStore
-from app.core.vector_store import MockEmbedder # Assuming a MockEmbedder class exists
+from app.core.vector_store.faiss_store import FaissVectorStore
+from app.core.vector_store.embedder.mock import MockEmbedder
 from app.db import models
 from app.core.retrievers import Retriever, FaissDBRetriever
 from app.core.llm_providers import get_llm_provider
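Every call site moves from the flat `app.core.vector_store` module to the new package paths. If touching each importer were undesirable, the package `__init__.py` (added below, and intentionally left empty in this commit) could instead re-export the public names so the old imports keep working. A minimal sketch of that alternative, assuming the `embedder` subpackage is importable and there are no circular imports between the submodules:

```python
# ai-hub/app/core/vector_store/__init__.py -- hypothetical compatibility shim,
# NOT what this commit ships (the committed __init__.py is empty).
# It would keep `from app.core.vector_store import FaissVectorStore` working,
# at the cost of importing faiss whenever the package is imported.
from .base import VectorStore
from .faiss_store import FaissVectorStore
from .embedder.factory import get_embedder_from_config
from .embedder.genai import GenAIEmbedder
from .embedder.mock import MockEmbedder

__all__ = [
    "VectorStore",
    "FaissVectorStore",
    "get_embedder_from_config",
    "GenAIEmbedder",
    "MockEmbedder",
]
```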
diff --git a/ai-hub/app/core/vector_store.py b/ai-hub/app/core/vector_store.py
deleted file mode 100644
index c40acf5..0000000
--- a/ai-hub/app/core/vector_store.py
+++ /dev/null
@@ -1,204 +0,0 @@
-import faiss
-import numpy as np
-import os
-import requests
-import json
-import logging
-from typing import List, Optional, Dict, Any
-from app.config import EmbeddingProvider
-
-# --- Embedder Implementations ---
-
-class MockEmbedder:
-    """A mock embedder for testing purposes."""
-    def __init__(self, dimension: int):
-        self.dimension = dimension
-
-    def embed_text(self, text: str) -> np.ndarray:
-        """
-        Generates a mock embedding synchronously.
-        """
-        logging.debug("Generating mock embedding...")
-        return np.random.rand(self.dimension).astype('float32').reshape(1, -1)
-
-class GenAIEmbedder:
-    """An embedder that uses the Google Generative AI service via direct synchronous HTTP."""
-    def __init__(self, model_name: str, api_key: str, dimension: int):
-        self.model_name = model_name
-        self.api_key = api_key
-        self.dimension = dimension
-
-    def embed_text(self, text: str) -> np.ndarray:
-        """
-        Generates an embedding by making a direct synchronous HTTP POST request
-        to the Gemini Embedding API.
-        """
-        logging.debug("Calling GenAI for embedding...")
-        if not self.api_key:
-            raise ValueError("API key not set for GenAIEmbedder.")
-
-        # Construct the API endpoint URL
-        api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent"
-
-        # Build the request headers and payload
-        headers = {
-            'Content-Type': 'application/json',
-            'x-goog-api-key': self.api_key
-        }
-        payload = {
-            "model": f"models/{self.model_name}",
-            "content": {"parts": [{"text": text}]},
-            "output_dimensionality": self.dimension
-        }
-
-        try:
-            # Use the synchronous 'requests' library
-            response = requests.post(api_url, headers=headers, data=json.dumps(payload))
-            response.raise_for_status() # Raise an exception for bad status codes
-
-            result = response.json()
-
-            # The 'embedding' field in the JSON response contains a 'values' list.
-            if 'embedding' not in result or 'values' not in result['embedding']:
-                raise KeyError("API response is missing the 'embedding' or 'values' field.")
-
-            # Extract the embedding values and convert to a numpy array
-            embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1)
-            logging.debug("GenAI embedding successfully generated.")
-            return embedding
-        except requests.exceptions.RequestException as e:
-            logging.error(f"HTTP client error embedding text with GenAI: {e}")
-            raise
-        except Exception as e:
-            logging.error(f"Error embedding text with GenAI: {e}")
-            raise e
-
-
-# --- Embedder Factory ---
-
-def get_embedder_from_config(
-    provider: EmbeddingProvider,
-    dimension: Optional[int],
-    model_name: Optional[str],
-    api_key: Optional[str]
-):
-    """
-    Factory function to create a synchronous embedder instance based on the configuration.
-    """
-    if provider == EmbeddingProvider.GOOGLE_GENAI:
-        if not api_key:
-            raise ValueError("Google GenAI requires an API key to be set in the configuration.")
-
-        logging.info(f"Using GenAIEmbedder with model: {model_name}")
-        return GenAIEmbedder(model_name=model_name, api_key=api_key, dimension=dimension)
-    elif provider == EmbeddingProvider.MOCK:
-        logging.info("Using MockEmbedder.")
-        return MockEmbedder(dimension=dimension)
-    else:
-        raise ValueError(f"Unsupported embedding provider: {provider}")
-
-
-# --- Vector Store Core ---
-
-class VectorStore:
-    """An abstract base class for vector stores."""
-    def add_document(self, text: str) -> int:
-        raise NotImplementedError
-
-    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
-        raise NotImplementedError
-
-class FaissVectorStore(VectorStore):
-    """
-    An in-memory vector store using the FAISS library for efficient similarity search.
-    This implementation handles the persistence of the FAISS index to a file.
-    """
-    def __init__(self, index_file_path: str, dimension: int, embedder):
-        """
-        Initializes the FaissVectorStore.
-        """
-        self.index_file_path = index_file_path
-        self.dimension = dimension
-        self.embedder = embedder
-
-        if os.path.exists(self.index_file_path):
-            logging.info(f"Loading FAISS index from {self.index_file_path}")
-            self.index = faiss.read_index(self.index_file_path)
-            self.doc_id_map = list(range(self.index.ntotal))
-        else:
-            logging.info("Creating a new FAISS index.")
-            self.index = faiss.IndexFlatL2(dimension)
-            self.doc_id_map = []
-
-    def add_document(self, text: str) -> int:
-        """
-        Embeds a document's text and adds the vector to the FAISS index.
-        This is now a synchronous method.
-        """
-        logging.debug("Embedding document text for FAISS index...")
-        vector = self.embedder.embed_text(text)
-        vector = vector.reshape(1, -1)
-        self.index.add(vector)
-
-        new_doc_id = self.index.ntotal - 1
-        self.doc_id_map.append(new_doc_id)
-
-        self.save_index()
-        logging.info(f"Document added to FAISS index with ID: {new_doc_id}")
-
-        return new_doc_id
-
-    def add_multiple_documents(self, texts: List[str]) -> List[int]:
-        """
-        Embeds multiple documents' texts and adds the vectors to the FAISS index.
-        This is now a synchronous method.
-        """
-        logging.debug("Embedding multiple document texts for FAISS index...")
-        # Embed each text synchronously
-        vectors = [self.embedder.embed_text(text) for text in texts]
-
-        # Reshape the vectors to be suitable for FAISS
-        vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32')
-        self.index.add(vectors)
-
-        new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal))
-        self.doc_id_map.extend(new_doc_ids)
-        self.save_index()
-
-        logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.")
-        return new_doc_ids
-
-    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
-        """
-        Embeds a query string and performs a similarity search in the FAISS index.
-        This is now a synchronous method.
-        """
-        logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'")
-        if self.index.ntotal == 0:
-            logging.warning("FAISS index is empty, no documents to search.")
-            return []
-
-        query_vector = self.embedder.embed_text(query_text)
-        query_vector = query_vector.reshape(1, -1)
-
-        D, I = self.index.search(query_vector, k)
-
-        result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0]
-        logging.info(f"Search complete, found {len(result_ids)} similar documents.")
-        return result_ids
-
-    def save_index(self):
-        """
-        Saves the FAISS index to the specified file path.
-        """
-        if self.index:
-            logging.info(f"Saving FAISS index to {self.index_file_path}")
-            faiss.write_index(self.index, self.index_file_path)
-
-    def load_index(self):
-        """
-        Loads a FAISS index from the specified file path.
-        """
-        if os.path.exists(self.index_file_path):
-            logging.info(f"Loading FAISS index from {self.index_file_path}")
-            self.index = faiss.read_index(self.index_file_path)
diff --git a/ai-hub/app/core/vector_store/__init__.py b/ai-hub/app/core/vector_store/__init__.py
new file mode 100644
index 0000000..3fbb1fd
--- /dev/null
+++ b/ai-hub/app/core/vector_store/__init__.py
@@ -0,0 +1 @@
+# This file can be left empty.
diff --git a/ai-hub/app/core/vector_store/base.py b/ai-hub/app/core/vector_store/base.py
new file mode 100644
index 0000000..e8fd31f
--- /dev/null
+++ b/ai-hub/app/core/vector_store/base.py
@@ -0,0 +1,8 @@
+from typing import List
+
+class VectorStore:
+    def add_document(self, text: str) -> int:
+        raise NotImplementedError
+
+    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
+        raise NotImplementedError
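Note that the extracted base class keeps the plain raise-`NotImplementedError` style (and drops the docstring) rather than using `abc.ABC`, which `retrievers.py` already uses for `Retriever`. A hypothetical stricter variant, if construction-time enforcement were wanted; this is a design alternative, not what the commit ships:

```python
# Hypothetical abc-based alternative for vector_store/base.py, mirroring the
# Retriever ABC in retrievers.py. The committed version only fails when an
# unimplemented method is actually called; this one fails at instantiation.
import abc
from typing import List

class VectorStore(abc.ABC):
    @abc.abstractmethod
    def add_document(self, text: str) -> int:
        ...

    @abc.abstractmethod
    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
        ...
```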
- """ - logging.debug("Calling GenAI for embedding...") - if not self.api_key: - raise ValueError("API key not set for GenAIEmbedder.") - - # Construct the API endpoint URL - api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" - - # Build the request headers and payload - headers = { - 'Content-Type': 'application/json', - 'x-goog-api-key': self.api_key - } - payload = { - "model": f"models/{self.model_name}", - "content": {"parts": [{"text": text}]}, - "output_dimensionality": self.dimension - } - - try: - # Use the synchronous 'requests' library - response = requests.post(api_url, headers=headers, data=json.dumps(payload)) - response.raise_for_status() # Raise an exception for bad status codes - - result = response.json() - - # The 'embedding' field in the JSON response contains a 'values' list. - if 'embedding' not in result or 'values' not in result['embedding']: - raise KeyError("API response is missing the 'embedding' or 'values' field.") - - # Extract the embedding values and convert to a numpy array - embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1) - logging.debug("GenAI embedding successfully generated.") - return embedding - except requests.exceptions.RequestException as e: - logging.error(f"HTTP client error embedding text with GenAI: {e}") - raise - except Exception as e: - logging.error(f"Error embedding text with GenAI: {e}") - raise e - - -# --- Embedder Factory --- - -def get_embedder_from_config( - provider: EmbeddingProvider, - dimension: Optional[int], - model_name: Optional[str], - api_key: Optional[str] -): - """ - Factory function to create a synchronous embedder instance based on the configuration. - """ - if provider == EmbeddingProvider.GOOGLE_GENAI: - if not api_key: - raise ValueError("Google GenAI requires an API key to be set in the configuration.") - - logging.info(f"Using GenAIEmbedder with model: {model_name}") - return GenAIEmbedder(model_name=model_name, api_key=api_key,dimension=dimension) - elif provider == EmbeddingProvider.MOCK: - logging.info("Using MockEmbedder.") - return MockEmbedder(dimension=dimension) - else: - raise ValueError(f"Unsupported embedding provider: {provider}") - - -# --- Vector Store Core --- - -class VectorStore: - """An abstract base class for vector stores.""" - def add_document(self, text: str) -> int: - raise NotImplementedError - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - raise NotImplementedError - -class FaissVectorStore(VectorStore): - """ - An in-memory vector store using the FAISS library for efficient similarity search. - This implementation handles the persistence of the FAISS index to a file. - """ - def __init__(self, index_file_path: str, dimension: int, embedder): - """ - Initializes the FaissVectorStore. - """ - self.index_file_path = index_file_path - self.dimension = dimension - self.embedder = embedder - - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) - self.doc_id_map = list(range(self.index.ntotal)) - else: - logging.info("Creating a new FAISS index.") - self.index = faiss.IndexFlatL2(dimension) - self.doc_id_map = [] - - def add_document(self, text: str) -> int: - """ - Embeds a document's text and adds the vector to the FAISS index. - This is now a synchronous method. 
- """ - logging.debug("Embedding document text for FAISS index...") - vector = self.embedder.embed_text(text) - vector = vector.reshape(1, -1) - self.index.add(vector) - - new_doc_id = self.index.ntotal - 1 - self.doc_id_map.append(new_doc_id) - - self.save_index() - logging.info(f"Document added to FAISS index with ID: {new_doc_id}") - - return new_doc_id - - def add_multiple_documents(self, texts: List[str]) -> List[int]: - """ - Embeds multiple documents' texts and adds the vectors to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding multiple document texts for FAISS index...") - # Embed each text synchronously - vectors = [self.embedder.embed_text(text) for text in texts] - - # Reshape the vectors to be suitable for FAISS - vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') - self.index.add(vectors) - - new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) - self.doc_id_map.extend(new_doc_ids) - self.save_index() - - logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") - return new_doc_ids - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - """ - Embeds a query string and performs a similarity search in the FAISS index. - This is now a synchronous method. - """ - logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") - if self.index.ntotal == 0: - logging.warning("FAISS index is empty, no documents to search.") - return [] - - query_vector = self.embedder.embed_text(query_text) - query_vector = query_vector.reshape(1, -1) - - D, I = self.index.search(query_vector, k) - - result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] - logging.info(f"Search complete, found {len(result_ids)} similar documents.") - return result_ids - - def save_index(self): - """ - Saves the FAISS index to the specified file path. - """ - if self.index: - logging.info(f"Saving FAISS index to {self.index_file_path}") - faiss.write_index(self.index, self.index_file_path) - - def load_index(self): - """ - Loads a FAISS index from the specified file path. - """ - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/__init__.py b/ai-hub/app/core/vector_store/__init__.py new file mode 100644 index 0000000..3fbb1fd --- /dev/null +++ b/ai-hub/app/core/vector_store/__init__.py @@ -0,0 +1 @@ +# This file can be left empty. 
diff --git a/ai-hub/app/core/vector_store/embedder/factory.py b/ai-hub/app/core/vector_store/embedder/factory.py
new file mode 100644
index 0000000..d958d3f
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/factory.py
@@ -0,0 +1,11 @@
+from app.config import EmbeddingProvider
+from .genai import GenAIEmbedder
+from .mock import MockEmbedder
+
+def get_embedder_from_config(provider, dimension, model_name, api_key):
+    if provider == EmbeddingProvider.GOOGLE_GENAI:
+        return GenAIEmbedder(model_name, api_key, dimension)
+    elif provider == EmbeddingProvider.MOCK:
+        return MockEmbedder(dimension)
+    else:
+        raise ValueError(f"Unsupported embedding provider: {provider}")
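Worth noting: the refactored factory drops the type hints, the logging, and the eager `api_key` guard that the deleted `get_embedder_from_config` had. `GenAIEmbedder.embed_text` still raises if the key is missing, but only at first use rather than at construction. A sketch that restores the earlier behavior, assuming the same call signature; again an alternative, not the committed code:

```python
# Hypothetical stricter factory.py -- restores the eager api_key validation
# and logging from the deleted module; the committed version defers the
# missing-key failure to the first embed_text() call.
import logging
from typing import Optional

from app.config import EmbeddingProvider
from .genai import GenAIEmbedder
from .mock import MockEmbedder

def get_embedder_from_config(
    provider: EmbeddingProvider,
    dimension: Optional[int],
    model_name: Optional[str],
    api_key: Optional[str],
):
    if provider == EmbeddingProvider.GOOGLE_GENAI:
        if not api_key:
            raise ValueError("Google GenAI requires an API key to be set in the configuration.")
        logging.info(f"Using GenAIEmbedder with model: {model_name}")
        return GenAIEmbedder(model_name, api_key, dimension)
    if provider == EmbeddingProvider.MOCK:
        logging.info("Using MockEmbedder.")
        return MockEmbedder(dimension)
    raise ValueError(f"Unsupported embedding provider: {provider}")
```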
- """ - logging.debug("Generating mock embedding...") - return np.random.rand(self.dimension).astype('float32').reshape(1, -1) - -class GenAIEmbedder: - """An embedder that uses the Google Generative AI service via direct synchronous HTTP.""" - def __init__(self, model_name: str, api_key: str, dimension: int): - self.model_name = model_name - self.api_key = api_key - self.dimension = dimension - - def embed_text(self, text: str) -> np.ndarray: - """ - Generates an embedding by making a direct synchronous HTTP POST request - to the Gemini Embedding API. - """ - logging.debug("Calling GenAI for embedding...") - if not self.api_key: - raise ValueError("API key not set for GenAIEmbedder.") - - # Construct the API endpoint URL - api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" - - # Build the request headers and payload - headers = { - 'Content-Type': 'application/json', - 'x-goog-api-key': self.api_key - } - payload = { - "model": f"models/{self.model_name}", - "content": {"parts": [{"text": text}]}, - "output_dimensionality": self.dimension - } - - try: - # Use the synchronous 'requests' library - response = requests.post(api_url, headers=headers, data=json.dumps(payload)) - response.raise_for_status() # Raise an exception for bad status codes - - result = response.json() - - # The 'embedding' field in the JSON response contains a 'values' list. - if 'embedding' not in result or 'values' not in result['embedding']: - raise KeyError("API response is missing the 'embedding' or 'values' field.") - - # Extract the embedding values and convert to a numpy array - embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1) - logging.debug("GenAI embedding successfully generated.") - return embedding - except requests.exceptions.RequestException as e: - logging.error(f"HTTP client error embedding text with GenAI: {e}") - raise - except Exception as e: - logging.error(f"Error embedding text with GenAI: {e}") - raise e - - -# --- Embedder Factory --- - -def get_embedder_from_config( - provider: EmbeddingProvider, - dimension: Optional[int], - model_name: Optional[str], - api_key: Optional[str] -): - """ - Factory function to create a synchronous embedder instance based on the configuration. - """ - if provider == EmbeddingProvider.GOOGLE_GENAI: - if not api_key: - raise ValueError("Google GenAI requires an API key to be set in the configuration.") - - logging.info(f"Using GenAIEmbedder with model: {model_name}") - return GenAIEmbedder(model_name=model_name, api_key=api_key,dimension=dimension) - elif provider == EmbeddingProvider.MOCK: - logging.info("Using MockEmbedder.") - return MockEmbedder(dimension=dimension) - else: - raise ValueError(f"Unsupported embedding provider: {provider}") - - -# --- Vector Store Core --- - -class VectorStore: - """An abstract base class for vector stores.""" - def add_document(self, text: str) -> int: - raise NotImplementedError - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - raise NotImplementedError - -class FaissVectorStore(VectorStore): - """ - An in-memory vector store using the FAISS library for efficient similarity search. - This implementation handles the persistence of the FAISS index to a file. - """ - def __init__(self, index_file_path: str, dimension: int, embedder): - """ - Initializes the FaissVectorStore. 
- """ - self.index_file_path = index_file_path - self.dimension = dimension - self.embedder = embedder - - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) - self.doc_id_map = list(range(self.index.ntotal)) - else: - logging.info("Creating a new FAISS index.") - self.index = faiss.IndexFlatL2(dimension) - self.doc_id_map = [] - - def add_document(self, text: str) -> int: - """ - Embeds a document's text and adds the vector to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding document text for FAISS index...") - vector = self.embedder.embed_text(text) - vector = vector.reshape(1, -1) - self.index.add(vector) - - new_doc_id = self.index.ntotal - 1 - self.doc_id_map.append(new_doc_id) - - self.save_index() - logging.info(f"Document added to FAISS index with ID: {new_doc_id}") - - return new_doc_id - - def add_multiple_documents(self, texts: List[str]) -> List[int]: - """ - Embeds multiple documents' texts and adds the vectors to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding multiple document texts for FAISS index...") - # Embed each text synchronously - vectors = [self.embedder.embed_text(text) for text in texts] - - # Reshape the vectors to be suitable for FAISS - vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') - self.index.add(vectors) - - new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) - self.doc_id_map.extend(new_doc_ids) - self.save_index() - - logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") - return new_doc_ids - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - """ - Embeds a query string and performs a similarity search in the FAISS index. - This is now a synchronous method. - """ - logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") - if self.index.ntotal == 0: - logging.warning("FAISS index is empty, no documents to search.") - return [] - - query_vector = self.embedder.embed_text(query_text) - query_vector = query_vector.reshape(1, -1) - - D, I = self.index.search(query_vector, k) - - result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] - logging.info(f"Search complete, found {len(result_ids)} similar documents.") - return result_ids - - def save_index(self): - """ - Saves the FAISS index to the specified file path. - """ - if self.index: - logging.info(f"Saving FAISS index to {self.index_file_path}") - faiss.write_index(self.index, self.index_file_path) - - def load_index(self): - """ - Loads a FAISS index from the specified file path. - """ - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/__init__.py b/ai-hub/app/core/vector_store/__init__.py new file mode 100644 index 0000000..3fbb1fd --- /dev/null +++ b/ai-hub/app/core/vector_store/__init__.py @@ -0,0 +1 @@ +# This file can be left empty. 
diff --git a/ai-hub/app/core/vector_store/embedder/genai.py b/ai-hub/app/core/vector_store/embedder/genai.py
new file mode 100644
index 0000000..99935d1
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/genai.py
@@ -0,0 +1,56 @@
+import json
+import logging
+import requests
+import numpy as np
+
+class GenAIEmbedder:
+    """An embedder that uses the Google Generative AI service via direct synchronous HTTP."""
+    def __init__(self, model_name: str, api_key: str, dimension: int):
+        self.model_name = model_name
+        self.api_key = api_key
+        self.dimension = dimension
+
+    def embed_text(self, text: str) -> np.ndarray:
+        """
+        Generates an embedding by making a direct synchronous HTTP POST request
+        to the Gemini Embedding API.
+        """
+        logging.debug("Calling GenAI for embedding...")
+        if not self.api_key:
+            raise ValueError("API key not set for GenAIEmbedder.")
+
+        # Construct the API endpoint URL
+        api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent"
+
+        # Build the request headers and payload
+        headers = {
+            'Content-Type': 'application/json',
+            'x-goog-api-key': self.api_key
+        }
+        payload = {
+            "model": f"models/{self.model_name}",
+            "content": {"parts": [{"text": text}]},
+            "output_dimensionality": self.dimension
+        }
+
+        try:
+            # Use the synchronous 'requests' library
+            response = requests.post(api_url, headers=headers, data=json.dumps(payload))
+            response.raise_for_status() # Raise an exception for bad status codes
+
+            result = response.json()
+
+            # The 'embedding' field in the JSON response contains a 'values' list.
+            if 'embedding' not in result or 'values' not in result['embedding']:
+                raise KeyError("API response is missing the 'embedding' or 'values' field.")
+
+            # Extract the embedding values and convert to a numpy array
+            embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1)
+            logging.debug("GenAI embedding successfully generated.")
+            return embedding
+        except requests.exceptions.RequestException as e:
+            logging.error(f"HTTP client error embedding text with GenAI: {e}")
+            raise
+        except Exception as e:
+            logging.error(f"Error embedding text with GenAI: {e}")
+            raise e
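The embedder is synchronous and self-contained, so it can be exercised directly. A usage sketch; the model name and dimension are illustrative, `GEMINI_API_KEY` is an assumed environment variable rather than anything this diff defines, and the output shape assumes the API honors `output_dimensionality`:

```python
# Illustrative only: model name, dimension, and env var are assumptions.
import os

from app.core.vector_store.embedder.genai import GenAIEmbedder

embedder = GenAIEmbedder(
    model_name="gemini-embedding-001",  # assumed model name
    api_key=os.environ["GEMINI_API_KEY"],
    dimension=768,
)
vec = embedder.embed_text("hello world")
print(vec.shape)  # (1, 768)
```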
- """ - logging.debug("Calling GenAI for embedding...") - if not self.api_key: - raise ValueError("API key not set for GenAIEmbedder.") - - # Construct the API endpoint URL - api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" - - # Build the request headers and payload - headers = { - 'Content-Type': 'application/json', - 'x-goog-api-key': self.api_key - } - payload = { - "model": f"models/{self.model_name}", - "content": {"parts": [{"text": text}]}, - "output_dimensionality": self.dimension - } - - try: - # Use the synchronous 'requests' library - response = requests.post(api_url, headers=headers, data=json.dumps(payload)) - response.raise_for_status() # Raise an exception for bad status codes - - result = response.json() - - # The 'embedding' field in the JSON response contains a 'values' list. - if 'embedding' not in result or 'values' not in result['embedding']: - raise KeyError("API response is missing the 'embedding' or 'values' field.") - - # Extract the embedding values and convert to a numpy array - embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1) - logging.debug("GenAI embedding successfully generated.") - return embedding - except requests.exceptions.RequestException as e: - logging.error(f"HTTP client error embedding text with GenAI: {e}") - raise - except Exception as e: - logging.error(f"Error embedding text with GenAI: {e}") - raise e - - -# --- Embedder Factory --- - -def get_embedder_from_config( - provider: EmbeddingProvider, - dimension: Optional[int], - model_name: Optional[str], - api_key: Optional[str] -): - """ - Factory function to create a synchronous embedder instance based on the configuration. - """ - if provider == EmbeddingProvider.GOOGLE_GENAI: - if not api_key: - raise ValueError("Google GenAI requires an API key to be set in the configuration.") - - logging.info(f"Using GenAIEmbedder with model: {model_name}") - return GenAIEmbedder(model_name=model_name, api_key=api_key,dimension=dimension) - elif provider == EmbeddingProvider.MOCK: - logging.info("Using MockEmbedder.") - return MockEmbedder(dimension=dimension) - else: - raise ValueError(f"Unsupported embedding provider: {provider}") - - -# --- Vector Store Core --- - -class VectorStore: - """An abstract base class for vector stores.""" - def add_document(self, text: str) -> int: - raise NotImplementedError - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - raise NotImplementedError - -class FaissVectorStore(VectorStore): - """ - An in-memory vector store using the FAISS library for efficient similarity search. - This implementation handles the persistence of the FAISS index to a file. - """ - def __init__(self, index_file_path: str, dimension: int, embedder): - """ - Initializes the FaissVectorStore. - """ - self.index_file_path = index_file_path - self.dimension = dimension - self.embedder = embedder - - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) - self.doc_id_map = list(range(self.index.ntotal)) - else: - logging.info("Creating a new FAISS index.") - self.index = faiss.IndexFlatL2(dimension) - self.doc_id_map = [] - - def add_document(self, text: str) -> int: - """ - Embeds a document's text and adds the vector to the FAISS index. - This is now a synchronous method. 
- """ - logging.debug("Embedding document text for FAISS index...") - vector = self.embedder.embed_text(text) - vector = vector.reshape(1, -1) - self.index.add(vector) - - new_doc_id = self.index.ntotal - 1 - self.doc_id_map.append(new_doc_id) - - self.save_index() - logging.info(f"Document added to FAISS index with ID: {new_doc_id}") - - return new_doc_id - - def add_multiple_documents(self, texts: List[str]) -> List[int]: - """ - Embeds multiple documents' texts and adds the vectors to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding multiple document texts for FAISS index...") - # Embed each text synchronously - vectors = [self.embedder.embed_text(text) for text in texts] - - # Reshape the vectors to be suitable for FAISS - vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') - self.index.add(vectors) - - new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) - self.doc_id_map.extend(new_doc_ids) - self.save_index() - - logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") - return new_doc_ids - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - """ - Embeds a query string and performs a similarity search in the FAISS index. - This is now a synchronous method. - """ - logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") - if self.index.ntotal == 0: - logging.warning("FAISS index is empty, no documents to search.") - return [] - - query_vector = self.embedder.embed_text(query_text) - query_vector = query_vector.reshape(1, -1) - - D, I = self.index.search(query_vector, k) - - result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] - logging.info(f"Search complete, found {len(result_ids)} similar documents.") - return result_ids - - def save_index(self): - """ - Saves the FAISS index to the specified file path. - """ - if self.index: - logging.info(f"Saving FAISS index to {self.index_file_path}") - faiss.write_index(self.index, self.index_file_path) - - def load_index(self): - """ - Loads a FAISS index from the specified file path. - """ - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/__init__.py b/ai-hub/app/core/vector_store/__init__.py new file mode 100644 index 0000000..3fbb1fd --- /dev/null +++ b/ai-hub/app/core/vector_store/__init__.py @@ -0,0 +1 @@ +# This file can be left empty. 
diff --git a/ai-hub/app/core/vector_store/base.py b/ai-hub/app/core/vector_store/base.py
new file mode 100644
index 0000000..e8fd31f
--- /dev/null
+++ b/ai-hub/app/core/vector_store/base.py
@@ -0,0 +1,8 @@
+from typing import List
+
+class VectorStore:
+    def add_document(self, text: str) -> int:
+        raise NotImplementedError
+
+    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
+        raise NotImplementedError
diff --git a/ai-hub/app/core/vector_store/embedder/factory.py b/ai-hub/app/core/vector_store/embedder/factory.py
new file mode 100644
index 0000000..d958d3f
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/factory.py
@@ -0,0 +1,11 @@
+from app.config import EmbeddingProvider
+from .genai import GenAIEmbedder
+from .mock import MockEmbedder
+
+def get_embedder_from_config(provider, dimension, model_name, api_key):
+    if provider == EmbeddingProvider.GOOGLE_GENAI:
+        return GenAIEmbedder(model_name, api_key, dimension)
+    elif provider == EmbeddingProvider.MOCK:
+        return MockEmbedder(dimension)
+    else:
+        raise ValueError(f"Unsupported embedding provider: {provider}")
diff --git a/ai-hub/app/core/vector_store/embedder/genai.py b/ai-hub/app/core/vector_store/embedder/genai.py
new file mode 100644
index 0000000..99935d1
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/genai.py
@@ -0,0 +1,56 @@
+import json
+import logging
+import requests
+import numpy as np
+
+class GenAIEmbedder:
+    """An embedder that uses the Google Generative AI service via direct synchronous HTTP."""
+    def __init__(self, model_name: str, api_key: str, dimension: int):
+        self.model_name = model_name
+        self.api_key = api_key
+        self.dimension = dimension
+
+    def embed_text(self, text: str) -> np.ndarray:
+        """
+        Generates an embedding by making a direct synchronous HTTP POST request
+        to the Gemini Embedding API.
+        """
+        logging.debug("Calling GenAI for embedding...")
+        if not self.api_key:
+            raise ValueError("API key not set for GenAIEmbedder.")
+
+        # Construct the API endpoint URL
+        api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent"
+
+        # Build the request headers and payload
+        headers = {
+            'Content-Type': 'application/json',
+            'x-goog-api-key': self.api_key
+        }
+        payload = {
+            "model": f"models/{self.model_name}",
+            "content": {"parts": [{"text": text}]},
+            "output_dimensionality": self.dimension
+        }
+
+        try:
+            # Use the synchronous 'requests' library
+            response = requests.post(api_url, headers=headers, data=json.dumps(payload))
+            response.raise_for_status()  # Raise an exception for bad status codes
+
+            result = response.json()
+
+            # The 'embedding' field in the JSON response contains a 'values' list.
+            if 'embedding' not in result or 'values' not in result['embedding']:
+                raise KeyError("API response is missing the 'embedding' or 'values' field.")
+
+            # Extract the embedding values and convert to a numpy array
+            embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1)
+            logging.debug("GenAI embedding successfully generated.")
+            return embedding
+        except requests.exceptions.RequestException as e:
+            logging.error(f"HTTP client error embedding text with GenAI: {e}")
+            raise
+        except Exception as e:
+            logging.error(f"Error embedding text with GenAI: {e}")
+            raise e
diff --git a/ai-hub/app/core/vector_store/embedder/mock.py b/ai-hub/app/core/vector_store/embedder/mock.py
new file mode 100644
index 0000000..0140e38
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/mock.py
@@ -0,0 +1,10 @@
+import numpy as np
+import logging
+
+class MockEmbedder:
+    def __init__(self, dimension: int):
+        self.dimension = dimension
+
+    def embed_text(self, text: str) -> np.ndarray:
+        logging.debug("Generating mock embedding...")
+        return np.random.rand(self.dimension).astype('float32').reshape(1, -1)
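A minimal sketch of exercising the new factory with the mock provider; the dimension value is an arbitrary assumption for illustration, and EmbeddingProvider is the enum imported from app.config as in the diff:

from app.config import EmbeddingProvider
from app.core.vector_store.embedder.factory import get_embedder_from_config

# The MOCK provider needs no model name or API key.
embedder = get_embedder_from_config(
    provider=EmbeddingProvider.MOCK,
    dimension=768,      # assumed value; any positive integer works for the mock
    model_name=None,
    api_key=None,
)
print(embedder.embed_text("hello").shape)  # -> (1, 768)

Note one behavioral shift visible in the diffs: the new factory drops the old factory's eager api_key check for GOOGLE_GENAI, so a missing key now surfaces later, as the ValueError raised inside GenAIEmbedder.embed_text.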
- """ - logging.debug("Generating mock embedding...") - return np.random.rand(self.dimension).astype('float32').reshape(1, -1) - -class GenAIEmbedder: - """An embedder that uses the Google Generative AI service via direct synchronous HTTP.""" - def __init__(self, model_name: str, api_key: str, dimension: int): - self.model_name = model_name - self.api_key = api_key - self.dimension = dimension - - def embed_text(self, text: str) -> np.ndarray: - """ - Generates an embedding by making a direct synchronous HTTP POST request - to the Gemini Embedding API. - """ - logging.debug("Calling GenAI for embedding...") - if not self.api_key: - raise ValueError("API key not set for GenAIEmbedder.") - - # Construct the API endpoint URL - api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" - - # Build the request headers and payload - headers = { - 'Content-Type': 'application/json', - 'x-goog-api-key': self.api_key - } - payload = { - "model": f"models/{self.model_name}", - "content": {"parts": [{"text": text}]}, - "output_dimensionality": self.dimension - } - - try: - # Use the synchronous 'requests' library - response = requests.post(api_url, headers=headers, data=json.dumps(payload)) - response.raise_for_status() # Raise an exception for bad status codes - - result = response.json() - - # The 'embedding' field in the JSON response contains a 'values' list. - if 'embedding' not in result or 'values' not in result['embedding']: - raise KeyError("API response is missing the 'embedding' or 'values' field.") - - # Extract the embedding values and convert to a numpy array - embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1) - logging.debug("GenAI embedding successfully generated.") - return embedding - except requests.exceptions.RequestException as e: - logging.error(f"HTTP client error embedding text with GenAI: {e}") - raise - except Exception as e: - logging.error(f"Error embedding text with GenAI: {e}") - raise e - - -# --- Embedder Factory --- - -def get_embedder_from_config( - provider: EmbeddingProvider, - dimension: Optional[int], - model_name: Optional[str], - api_key: Optional[str] -): - """ - Factory function to create a synchronous embedder instance based on the configuration. - """ - if provider == EmbeddingProvider.GOOGLE_GENAI: - if not api_key: - raise ValueError("Google GenAI requires an API key to be set in the configuration.") - - logging.info(f"Using GenAIEmbedder with model: {model_name}") - return GenAIEmbedder(model_name=model_name, api_key=api_key,dimension=dimension) - elif provider == EmbeddingProvider.MOCK: - logging.info("Using MockEmbedder.") - return MockEmbedder(dimension=dimension) - else: - raise ValueError(f"Unsupported embedding provider: {provider}") - - -# --- Vector Store Core --- - -class VectorStore: - """An abstract base class for vector stores.""" - def add_document(self, text: str) -> int: - raise NotImplementedError - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - raise NotImplementedError - -class FaissVectorStore(VectorStore): - """ - An in-memory vector store using the FAISS library for efficient similarity search. - This implementation handles the persistence of the FAISS index to a file. - """ - def __init__(self, index_file_path: str, dimension: int, embedder): - """ - Initializes the FaissVectorStore. 
- """ - self.index_file_path = index_file_path - self.dimension = dimension - self.embedder = embedder - - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) - self.doc_id_map = list(range(self.index.ntotal)) - else: - logging.info("Creating a new FAISS index.") - self.index = faiss.IndexFlatL2(dimension) - self.doc_id_map = [] - - def add_document(self, text: str) -> int: - """ - Embeds a document's text and adds the vector to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding document text for FAISS index...") - vector = self.embedder.embed_text(text) - vector = vector.reshape(1, -1) - self.index.add(vector) - - new_doc_id = self.index.ntotal - 1 - self.doc_id_map.append(new_doc_id) - - self.save_index() - logging.info(f"Document added to FAISS index with ID: {new_doc_id}") - - return new_doc_id - - def add_multiple_documents(self, texts: List[str]) -> List[int]: - """ - Embeds multiple documents' texts and adds the vectors to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding multiple document texts for FAISS index...") - # Embed each text synchronously - vectors = [self.embedder.embed_text(text) for text in texts] - - # Reshape the vectors to be suitable for FAISS - vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') - self.index.add(vectors) - - new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) - self.doc_id_map.extend(new_doc_ids) - self.save_index() - - logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") - return new_doc_ids - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - """ - Embeds a query string and performs a similarity search in the FAISS index. - This is now a synchronous method. - """ - logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") - if self.index.ntotal == 0: - logging.warning("FAISS index is empty, no documents to search.") - return [] - - query_vector = self.embedder.embed_text(query_text) - query_vector = query_vector.reshape(1, -1) - - D, I = self.index.search(query_vector, k) - - result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] - logging.info(f"Search complete, found {len(result_ids)} similar documents.") - return result_ids - - def save_index(self): - """ - Saves the FAISS index to the specified file path. - """ - if self.index: - logging.info(f"Saving FAISS index to {self.index_file_path}") - faiss.write_index(self.index, self.index_file_path) - - def load_index(self): - """ - Loads a FAISS index from the specified file path. - """ - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/__init__.py b/ai-hub/app/core/vector_store/__init__.py new file mode 100644 index 0000000..3fbb1fd --- /dev/null +++ b/ai-hub/app/core/vector_store/__init__.py @@ -0,0 +1 @@ +# This file can be left empty. 
diff --git a/ai-hub/app/core/vector_store/base.py b/ai-hub/app/core/vector_store/base.py new file mode 100644 index 0000000..e8fd31f --- /dev/null +++ b/ai-hub/app/core/vector_store/base.py @@ -0,0 +1,8 @@ +from typing import List + +class VectorStore: + def add_document(self, text: str) -> int: + raise NotImplementedError + + def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: + raise NotImplementedError diff --git a/ai-hub/app/core/vector_store/embedder/factory.py b/ai-hub/app/core/vector_store/embedder/factory.py new file mode 100644 index 0000000..d958d3f --- /dev/null +++ b/ai-hub/app/core/vector_store/embedder/factory.py @@ -0,0 +1,11 @@ +from app.config import EmbeddingProvider +from .genai import GenAIEmbedder +from .mock import MockEmbedder + +def get_embedder_from_config(provider, dimension, model_name, api_key): + if provider == EmbeddingProvider.GOOGLE_GENAI: + return GenAIEmbedder(model_name, api_key, dimension) + elif provider == EmbeddingProvider.MOCK: + return MockEmbedder(dimension) + else: + raise ValueError(f"Unsupported embedding provider: {provider}") diff --git a/ai-hub/app/core/vector_store/embedder/genai.py b/ai-hub/app/core/vector_store/embedder/genai.py new file mode 100644 index 0000000..99935d1 --- /dev/null +++ b/ai-hub/app/core/vector_store/embedder/genai.py @@ -0,0 +1,56 @@ +import json +import logging +import requests +import numpy as np + +class GenAIEmbedder: + """An embedder that uses the Google Generative AI service via direct synchronous HTTP.""" + def __init__(self, model_name: str, api_key: str, dimension: int): + self.model_name = model_name + self.api_key = api_key + self.dimension = dimension + + def embed_text(self, text: str) -> np.ndarray: + """ + Generates an embedding by making a direct synchronous HTTP POST request + to the Gemini Embedding API. + """ + logging.debug("Calling GenAI for embedding...") + if not self.api_key: + raise ValueError("API key not set for GenAIEmbedder.") + + # Construct the API endpoint URL + api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" + + # Build the request headers and payload + headers = { + 'Content-Type': 'application/json', + 'x-goog-api-key': self.api_key + } + payload = { + "model": f"models/{self.model_name}", + "content": {"parts": [{"text": text}]}, + "output_dimensionality": self.dimension + } + + try: + # Use the synchronous 'requests' library + response = requests.post(api_url, headers=headers, data=json.dumps(payload)) + response.raise_for_status() # Raise an exception for bad status codes + + result = response.json() + + # The 'embedding' field in the JSON response contains a 'values' list. 
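For the GOOGLE_GENAI branch, constructing the embedder is side-effect free; only embed_text performs the HTTP call. A sketch with placeholder credentials (the model name below is illustrative, not a value taken from the diff):

from app.core.vector_store.embedder.genai import GenAIEmbedder

embedder = GenAIEmbedder(
    model_name="gemini-embedding-001",  # illustrative placeholder
    api_key="YOUR_API_KEY",             # placeholder; embed_text raises ValueError if empty
    dimension=768,
)
# embedder.embed_text("...") POSTs to
# https://generativelanguage.googleapis.com/v1beta/models/<model_name>:embedContent
# and parses response["embedding"]["values"] into a (1, 768) float32 array.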
diff --git a/ai-hub/app/core/vector_store/faiss_store.py b/ai-hub/app/core/vector_store/faiss_store.py
new file mode 100644
index 0000000..5573c98
--- /dev/null
+++ b/ai-hub/app/core/vector_store/faiss_store.py
@@ -0,0 +1,102 @@
+import os
+import logging
+import faiss
+import numpy as np
+from .base import VectorStore
+from .utils import save_faiss_index, load_faiss_index
+from typing import List, Optional, Dict, Any
+
+class FaissVectorStore(VectorStore):
+    """
+    An in-memory vector store using the FAISS library for efficient similarity search.
+    This implementation handles the persistence of the FAISS index to a file.
+    """
+    def __init__(self, index_file_path: str, dimension: int, embedder):
+        """
+        Initializes the FaissVectorStore.
+        """
+        self.index_file_path = index_file_path
+        self.dimension = dimension
+        self.embedder = embedder
+
+        if os.path.exists(self.index_file_path):
+            logging.info(f"Loading FAISS index from {self.index_file_path}")
+            self.index = faiss.read_index(self.index_file_path)
+            self.doc_id_map = list(range(self.index.ntotal))
+        else:
+            logging.info("Creating a new FAISS index.")
+            self.index = faiss.IndexFlatL2(dimension)
+            self.doc_id_map = []
+
+    def add_document(self, text: str) -> int:
+        """
+        Embeds a document's text and adds the vector to the FAISS index.
+        This is now a synchronous method.
+        """
+        logging.debug("Embedding document text for FAISS index...")
+        vector = self.embedder.embed_text(text)
+        vector = vector.reshape(1, -1)
+        self.index.add(vector)
+
+        new_doc_id = self.index.ntotal - 1
+        self.doc_id_map.append(new_doc_id)
+
+        self.save_index()
+        logging.info(f"Document added to FAISS index with ID: {new_doc_id}")
+
+        return new_doc_id
+
+    def add_multiple_documents(self, texts: List[str]) -> List[int]:
+        """
+        Embeds multiple documents' texts and adds the vectors to the FAISS index.
+        This is now a synchronous method.
+        """
+        logging.debug("Embedding multiple document texts for FAISS index...")
+        # Embed each text synchronously
+        vectors = [self.embedder.embed_text(text) for text in texts]
+
+        # Reshape the vectors to be suitable for FAISS
+        vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32')
+        self.index.add(vectors)
+
+        new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal))
+        self.doc_id_map.extend(new_doc_ids)
+        self.save_index()
+
+        logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.")
+        return new_doc_ids
+
+    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
+        """
+        Embeds a query string and performs a similarity search in the FAISS index.
+        This is now a synchronous method.
+        """
+        logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'")
+        if self.index.ntotal == 0:
+            logging.warning("FAISS index is empty, no documents to search.")
+            return []
+
+        query_vector = self.embedder.embed_text(query_text)
+        query_vector = query_vector.reshape(1, -1)
+
+        D, I = self.index.search(query_vector, k)
+
+        result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0]
+        logging.info(f"Search complete, found {len(result_ids)} similar documents.")
+        return result_ids
+
+    def save_index(self):
+        """
+        Saves the FAISS index to the specified file path.
+        """
+        if self.index:
+            logging.info(f"Saving FAISS index to {self.index_file_path}")
+            faiss.write_index(self.index, self.index_file_path)
+
+    def load_index(self):
+        """
+        Loads a FAISS index from the specified file path.
+        """
+        if os.path.exists(self.index_file_path):
+            logging.info(f"Loading FAISS index from {self.index_file_path}")
+            self.index = faiss.read_index(self.index_file_path)
diff --git a/ai-hub/app/core/vector_store/utils.py b/ai-hub/app/core/vector_store/utils.py
new file mode 100644
index 0000000..af2bc48
--- /dev/null
+++ b/ai-hub/app/core/vector_store/utils.py
@@ -0,0 +1,11 @@
+import faiss
+import os
+
+def save_faiss_index(index, path: str):
+    if index:
+        faiss.write_index(index, path)
+
+def load_faiss_index(path: str):
+    if os.path.exists(path):
+        return faiss.read_index(path)
+    return None
+ """ + logging.debug("Embedding multiple document texts for FAISS index...") + # Embed each text synchronously + vectors = [self.embedder.embed_text(text) for text in texts] + + # Reshape the vectors to be suitable for FAISS + vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') + self.index.add(vectors) + + new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) + self.doc_id_map.extend(new_doc_ids) + self.save_index() + + logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") + return new_doc_ids + + def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: + """ + Embeds a query string and performs a similarity search in the FAISS index. + This is now a synchronous method. + """ + logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") + if self.index.ntotal == 0: + logging.warning("FAISS index is empty, no documents to search.") + return [] + + query_vector = self.embedder.embed_text(query_text) + query_vector = query_vector.reshape(1, -1) + + D, I = self.index.search(query_vector, k) + + result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] + logging.info(f"Search complete, found {len(result_ids)} similar documents.") + return result_ids + + def save_index(self): + """ + Saves the FAISS index to the specified file path. + """ + if self.index: + logging.info(f"Saving FAISS index to {self.index_file_path}") + faiss.write_index(self.index, self.index_file_path) + + def load_index(self): + """ + Loads a FAISS index from the specified file path. + """ + if os.path.exists(self.index_file_path): + logging.info(f"Loading FAISS index from {self.index_file_path}") + self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/utils.py b/ai-hub/app/core/vector_store/utils.py new file mode 100644 index 0000000..af2bc48 --- /dev/null +++ b/ai-hub/app/core/vector_store/utils.py @@ -0,0 +1,11 @@ +import faiss +import os + +def save_faiss_index(index, path: str): + if index: + faiss.write_index(index, path) + +def load_faiss_index(path: str): + if os.path.exists(path): + return faiss.read_index(path) + return None diff --git a/ai-hub/tests/core/test_services.py b/ai-hub/tests/core/test_services.py index a5d1191..d36c203 100644 --- a/ai-hub/tests/core/test_services.py +++ b/ai-hub/tests/core/test_services.py @@ -10,7 +10,8 @@ # Import the service and its dependencies from app.core.services import RAGService from app.db import models -from app.core.vector_store import FaissVectorStore, MockEmbedder +from app.core.vector_store.faiss_store import FaissVectorStore +from app.core.vector_store.embedder.mock import MockEmbedder # Import FaissDBRetriever and a mock WebRetriever for testing different cases from app.core.retrievers import FaissDBRetriever, Retriever from app.core.pipelines.dspy_rag import DspyRagPipeline, DSPyLLMProvider @@ -227,7 +228,7 @@ @patch('app.db.models.VectorMetadata') @patch('app.db.models.Document') -@patch('app.core.vector_store.FaissVectorStore') +@patch('app.core.vector_store.faiss_store.FaissVectorStore') def test_rag_service_add_document_success(mock_vector_store, mock_document_model, mock_vector_metadata_model): """ Test the RAGService.add_document method for a successful run. 
diff --git a/ai-hub/tests/core/test_services.py b/ai-hub/tests/core/test_services.py
index a5d1191..d36c203 100644
--- a/ai-hub/tests/core/test_services.py
+++ b/ai-hub/tests/core/test_services.py
@@ -10,7 +10,8 @@
 # Import the service and its dependencies
 from app.core.services import RAGService
 from app.db import models
-from app.core.vector_store import FaissVectorStore, MockEmbedder
+from app.core.vector_store.faiss_store import FaissVectorStore
+from app.core.vector_store.embedder.mock import MockEmbedder
 # Import FaissDBRetriever and a mock WebRetriever for testing different cases
 from app.core.retrievers import FaissDBRetriever, Retriever
 from app.core.pipelines.dspy_rag import DspyRagPipeline, DSPyLLMProvider
@@ -227,7 +228,7 @@
 @patch('app.db.models.VectorMetadata')
 @patch('app.db.models.Document')
-@patch('app.core.vector_store.FaissVectorStore')
+@patch('app.core.vector_store.faiss_store.FaissVectorStore')
 def test_rag_service_add_document_success(mock_vector_store, mock_document_model, mock_vector_metadata_model):
     """
     Test the RAGService.add_document method for a successful run.
@@ -282,7 +283,7 @@
         embedding_model="mock_embedder" # This now passes because the mock embedder is of type MockEmbedder
     )
 
-@patch('app.core.vector_store.FaissVectorStore')
+@patch('app.core.vector_store.faiss_store.FaissVectorStore')
 def test_rag_service_add_document_error_handling(mock_vector_store):
     """
     Test the RAGService.add_document method's error handling.
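The updated patch targets are not cosmetic: unittest.mock patches a name where it is looked up, and after the refactor FaissVectorStore lives in app.core.vector_store.faiss_store while the package __init__ is empty, so the old target string would fail with an AttributeError. A small illustration (the values passed to the mock are arbitrary):

```python
# Sketch: patch the class at its new module path. Patching the old
# "app.core.vector_store.FaissVectorStore" would fail because the
# package __init__ no longer re-exports the class.
from unittest.mock import patch

with patch("app.core.vector_store.faiss_store.FaissVectorStore") as MockStore:
    MockStore.return_value.add_document.return_value = 0
    store = MockStore(index_file_path="x.faiss", dimension=8, embedder=None)
    assert store.add_document("hello") == 0
```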
- """ - logging.debug("Calling GenAI for embedding...") - if not self.api_key: - raise ValueError("API key not set for GenAIEmbedder.") - - # Construct the API endpoint URL - api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" - - # Build the request headers and payload - headers = { - 'Content-Type': 'application/json', - 'x-goog-api-key': self.api_key - } - payload = { - "model": f"models/{self.model_name}", - "content": {"parts": [{"text": text}]}, - "output_dimensionality": self.dimension - } - - try: - # Use the synchronous 'requests' library - response = requests.post(api_url, headers=headers, data=json.dumps(payload)) - response.raise_for_status() # Raise an exception for bad status codes - - result = response.json() - - # The 'embedding' field in the JSON response contains a 'values' list. - if 'embedding' not in result or 'values' not in result['embedding']: - raise KeyError("API response is missing the 'embedding' or 'values' field.") - - # Extract the embedding values and convert to a numpy array - embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1) - logging.debug("GenAI embedding successfully generated.") - return embedding - except requests.exceptions.RequestException as e: - logging.error(f"HTTP client error embedding text with GenAI: {e}") - raise - except Exception as e: - logging.error(f"Error embedding text with GenAI: {e}") - raise e - - -# --- Embedder Factory --- - -def get_embedder_from_config( - provider: EmbeddingProvider, - dimension: Optional[int], - model_name: Optional[str], - api_key: Optional[str] -): - """ - Factory function to create a synchronous embedder instance based on the configuration. - """ - if provider == EmbeddingProvider.GOOGLE_GENAI: - if not api_key: - raise ValueError("Google GenAI requires an API key to be set in the configuration.") - - logging.info(f"Using GenAIEmbedder with model: {model_name}") - return GenAIEmbedder(model_name=model_name, api_key=api_key,dimension=dimension) - elif provider == EmbeddingProvider.MOCK: - logging.info("Using MockEmbedder.") - return MockEmbedder(dimension=dimension) - else: - raise ValueError(f"Unsupported embedding provider: {provider}") - - -# --- Vector Store Core --- - -class VectorStore: - """An abstract base class for vector stores.""" - def add_document(self, text: str) -> int: - raise NotImplementedError - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - raise NotImplementedError - -class FaissVectorStore(VectorStore): - """ - An in-memory vector store using the FAISS library for efficient similarity search. - This implementation handles the persistence of the FAISS index to a file. - """ - def __init__(self, index_file_path: str, dimension: int, embedder): - """ - Initializes the FaissVectorStore. - """ - self.index_file_path = index_file_path - self.dimension = dimension - self.embedder = embedder - - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) - self.doc_id_map = list(range(self.index.ntotal)) - else: - logging.info("Creating a new FAISS index.") - self.index = faiss.IndexFlatL2(dimension) - self.doc_id_map = [] - - def add_document(self, text: str) -> int: - """ - Embeds a document's text and adds the vector to the FAISS index. - This is now a synchronous method. 
- """ - logging.debug("Embedding document text for FAISS index...") - vector = self.embedder.embed_text(text) - vector = vector.reshape(1, -1) - self.index.add(vector) - - new_doc_id = self.index.ntotal - 1 - self.doc_id_map.append(new_doc_id) - - self.save_index() - logging.info(f"Document added to FAISS index with ID: {new_doc_id}") - - return new_doc_id - - def add_multiple_documents(self, texts: List[str]) -> List[int]: - """ - Embeds multiple documents' texts and adds the vectors to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding multiple document texts for FAISS index...") - # Embed each text synchronously - vectors = [self.embedder.embed_text(text) for text in texts] - - # Reshape the vectors to be suitable for FAISS - vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') - self.index.add(vectors) - - new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) - self.doc_id_map.extend(new_doc_ids) - self.save_index() - - logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") - return new_doc_ids - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - """ - Embeds a query string and performs a similarity search in the FAISS index. - This is now a synchronous method. - """ - logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") - if self.index.ntotal == 0: - logging.warning("FAISS index is empty, no documents to search.") - return [] - - query_vector = self.embedder.embed_text(query_text) - query_vector = query_vector.reshape(1, -1) - - D, I = self.index.search(query_vector, k) - - result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] - logging.info(f"Search complete, found {len(result_ids)} similar documents.") - return result_ids - - def save_index(self): - """ - Saves the FAISS index to the specified file path. - """ - if self.index: - logging.info(f"Saving FAISS index to {self.index_file_path}") - faiss.write_index(self.index, self.index_file_path) - - def load_index(self): - """ - Loads a FAISS index from the specified file path. - """ - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/__init__.py b/ai-hub/app/core/vector_store/__init__.py new file mode 100644 index 0000000..3fbb1fd --- /dev/null +++ b/ai-hub/app/core/vector_store/__init__.py @@ -0,0 +1 @@ +# This file can be left empty. 
diff --git a/ai-hub/app/core/vector_store/base.py b/ai-hub/app/core/vector_store/base.py new file mode 100644 index 0000000..e8fd31f --- /dev/null +++ b/ai-hub/app/core/vector_store/base.py @@ -0,0 +1,8 @@ +from typing import List + +class VectorStore: + def add_document(self, text: str) -> int: + raise NotImplementedError + + def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: + raise NotImplementedError diff --git a/ai-hub/app/core/vector_store/embedder/factory.py b/ai-hub/app/core/vector_store/embedder/factory.py new file mode 100644 index 0000000..d958d3f --- /dev/null +++ b/ai-hub/app/core/vector_store/embedder/factory.py @@ -0,0 +1,11 @@ +from app.config import EmbeddingProvider +from .genai import GenAIEmbedder +from .mock import MockEmbedder + +def get_embedder_from_config(provider, dimension, model_name, api_key): + if provider == EmbeddingProvider.GOOGLE_GENAI: + return GenAIEmbedder(model_name, api_key, dimension) + elif provider == EmbeddingProvider.MOCK: + return MockEmbedder(dimension) + else: + raise ValueError(f"Unsupported embedding provider: {provider}") diff --git a/ai-hub/app/core/vector_store/embedder/genai.py b/ai-hub/app/core/vector_store/embedder/genai.py new file mode 100644 index 0000000..99935d1 --- /dev/null +++ b/ai-hub/app/core/vector_store/embedder/genai.py @@ -0,0 +1,56 @@ +import json +import logging +import requests +import numpy as np + +class GenAIEmbedder: + """An embedder that uses the Google Generative AI service via direct synchronous HTTP.""" + def __init__(self, model_name: str, api_key: str, dimension: int): + self.model_name = model_name + self.api_key = api_key + self.dimension = dimension + + def embed_text(self, text: str) -> np.ndarray: + """ + Generates an embedding by making a direct synchronous HTTP POST request + to the Gemini Embedding API. + """ + logging.debug("Calling GenAI for embedding...") + if not self.api_key: + raise ValueError("API key not set for GenAIEmbedder.") + + # Construct the API endpoint URL + api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" + + # Build the request headers and payload + headers = { + 'Content-Type': 'application/json', + 'x-goog-api-key': self.api_key + } + payload = { + "model": f"models/{self.model_name}", + "content": {"parts": [{"text": text}]}, + "output_dimensionality": self.dimension + } + + try: + # Use the synchronous 'requests' library + response = requests.post(api_url, headers=headers, data=json.dumps(payload)) + response.raise_for_status() # Raise an exception for bad status codes + + result = response.json() + + # The 'embedding' field in the JSON response contains a 'values' list. 
+ if 'embedding' not in result or 'values' not in result['embedding']: + raise KeyError("API response is missing the 'embedding' or 'values' field.") + + # Extract the embedding values and convert to a numpy array + embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1) + logging.debug("GenAI embedding successfully generated.") + return embedding + except requests.exceptions.RequestException as e: + logging.error(f"HTTP client error embedding text with GenAI: {e}") + raise + except Exception as e: + logging.error(f"Error embedding text with GenAI: {e}") + raise e diff --git a/ai-hub/app/core/vector_store/embedder/mock.py b/ai-hub/app/core/vector_store/embedder/mock.py new file mode 100644 index 0000000..0140e38 --- /dev/null +++ b/ai-hub/app/core/vector_store/embedder/mock.py @@ -0,0 +1,10 @@ +import numpy as np +import logging + +class MockEmbedder: + def __init__(self, dimension: int): + self.dimension = dimension + + def embed_text(self, text: str) -> np.ndarray: + logging.debug("Generating mock embedding...") + return np.random.rand(self.dimension).astype('float32').reshape(1, -1) diff --git a/ai-hub/app/core/vector_store/faiss_store.py b/ai-hub/app/core/vector_store/faiss_store.py new file mode 100644 index 0000000..5573c98 --- /dev/null +++ b/ai-hub/app/core/vector_store/faiss_store.py @@ -0,0 +1,102 @@ +import os +import logging +import faiss +import numpy as np +from .base import VectorStore +from .utils import save_faiss_index, load_faiss_index +from typing import List, Optional, Dict, Any + +class FaissVectorStore(VectorStore): + """ + An in-memory vector store using the FAISS library for efficient similarity search. + This implementation handles the persistence of the FAISS index to a file. + """ + def __init__(self, index_file_path: str, dimension: int, embedder): + """ + Initializes the FaissVectorStore. + """ + self.index_file_path = index_file_path + self.dimension = dimension + self.embedder = embedder + + if os.path.exists(self.index_file_path): + logging.info(f"Loading FAISS index from {self.index_file_path}") + self.index = faiss.read_index(self.index_file_path) + self.doc_id_map = list(range(self.index.ntotal)) + else: + logging.info("Creating a new FAISS index.") + self.index = faiss.IndexFlatL2(dimension) + self.doc_id_map = [] + + def add_document(self, text: str) -> int: + """ + Embeds a document's text and adds the vector to the FAISS index. + This is now a synchronous method. + """ + logging.debug("Embedding document text for FAISS index...") + vector = self.embedder.embed_text(text) + vector = vector.reshape(1, -1) + self.index.add(vector) + + new_doc_id = self.index.ntotal - 1 + self.doc_id_map.append(new_doc_id) + + self.save_index() + logging.info(f"Document added to FAISS index with ID: {new_doc_id}") + + return new_doc_id + + def add_multiple_documents(self, texts: List[str]) -> List[int]: + """ + Embeds multiple documents' texts and adds the vectors to the FAISS index. + This is now a synchronous method. 
+ """ + logging.debug("Embedding multiple document texts for FAISS index...") + # Embed each text synchronously + vectors = [self.embedder.embed_text(text) for text in texts] + + # Reshape the vectors to be suitable for FAISS + vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') + self.index.add(vectors) + + new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) + self.doc_id_map.extend(new_doc_ids) + self.save_index() + + logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") + return new_doc_ids + + def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: + """ + Embeds a query string and performs a similarity search in the FAISS index. + This is now a synchronous method. + """ + logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") + if self.index.ntotal == 0: + logging.warning("FAISS index is empty, no documents to search.") + return [] + + query_vector = self.embedder.embed_text(query_text) + query_vector = query_vector.reshape(1, -1) + + D, I = self.index.search(query_vector, k) + + result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] + logging.info(f"Search complete, found {len(result_ids)} similar documents.") + return result_ids + + def save_index(self): + """ + Saves the FAISS index to the specified file path. + """ + if self.index: + logging.info(f"Saving FAISS index to {self.index_file_path}") + faiss.write_index(self.index, self.index_file_path) + + def load_index(self): + """ + Loads a FAISS index from the specified file path. + """ + if os.path.exists(self.index_file_path): + logging.info(f"Loading FAISS index from {self.index_file_path}") + self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/utils.py b/ai-hub/app/core/vector_store/utils.py new file mode 100644 index 0000000..af2bc48 --- /dev/null +++ b/ai-hub/app/core/vector_store/utils.py @@ -0,0 +1,11 @@ +import faiss +import os + +def save_faiss_index(index, path: str): + if index: + faiss.write_index(index, path) + +def load_faiss_index(path: str): + if os.path.exists(path): + return faiss.read_index(path) + return None diff --git a/ai-hub/tests/core/test_services.py b/ai-hub/tests/core/test_services.py index a5d1191..d36c203 100644 --- a/ai-hub/tests/core/test_services.py +++ b/ai-hub/tests/core/test_services.py @@ -10,7 +10,8 @@ # Import the service and its dependencies from app.core.services import RAGService from app.db import models -from app.core.vector_store import FaissVectorStore, MockEmbedder +from app.core.vector_store.faiss_store import FaissVectorStore +from app.core.vector_store.embedder.mock import MockEmbedder # Import FaissDBRetriever and a mock WebRetriever for testing different cases from app.core.retrievers import FaissDBRetriever, Retriever from app.core.pipelines.dspy_rag import DspyRagPipeline, DSPyLLMProvider @@ -227,7 +228,7 @@ @patch('app.db.models.VectorMetadata') @patch('app.db.models.Document') -@patch('app.core.vector_store.FaissVectorStore') +@patch('app.core.vector_store.faiss_store.FaissVectorStore') def test_rag_service_add_document_success(mock_vector_store, mock_document_model, mock_vector_metadata_model): """ Test the RAGService.add_document method for a successful run. 
diff --git a/ai-hub/tests/core/test_vector_store.py b/ai-hub/tests/core/test_vector_store.py
deleted file mode 100644
index 0af7e46..0000000
--- a/ai-hub/tests/core/test_vector_store.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import os
-import pytest
-import numpy as np
-import requests
-import json
-from unittest import mock
-from unittest.mock import MagicMock
-
-from app.core.vector_store import FaissVectorStore, MockEmbedder, GenAIEmbedder, get_embedder_from_config
-from app.config import EmbeddingProvider
-
-# Define a constant for the dimension to ensure consistency
-TEST_DIMENSION = 768
-
-# --- Fixtures ---
-
-@pytest.fixture
-def temp_faiss_file(tmp_path):
-    """
-    Provides a temporary file path for the FAISS index to ensure tests are isolated.
-    """
-    test_dir = tmp_path / "faiss_test"
-    test_dir.mkdir()
-    return str(test_dir / "test_index.faiss")
-
-@pytest.fixture
-def mock_embedder():
-    """
-    Creates a MockEmbedder instance with the correct dimension.
-    """
-    return MockEmbedder(dimension=TEST_DIMENSION)
-
-@pytest.fixture
-def mock_genai_embedder():
-    """
-    Mocks the GenAIEmbedder to avoid making real API calls.
-    It patches the synchronous requests.post call and returns a mock response.
-    """
-    with mock.patch('requests.post') as mock_post:
-        # Configure the mock response object
-        mock_response = MagicMock()
-        mock_response.raise_for_status.return_value = None  # No exception on success
-
-        # Define the JSON content that the mock response will return
-        embedding_data = np.random.rand(TEST_DIMENSION).tolist()
-        mock_response.json.return_value = {
-            "embedding": {"values": embedding_data}
-        }
-        mock_post.return_value = mock_response
-
-        # Create an instance of the real GenAIEmbedder class, now with the dimension argument
-        embedder = GenAIEmbedder(
-            model_name="gemini-embedding-001",
-            api_key="mock_api_key_for_testing",
-            dimension=TEST_DIMENSION  # FIX: Added the missing dimension argument
-        )
-        yield embedder
-
-@pytest.fixture(params=[
-    pytest.param('mock_embedder', id="MockEmbedder"),
-    pytest.param('mock_genai_embedder', id="GenAIEmbedder")
-])
-def faiss_store(request, temp_faiss_file):
-    """
-    Parametrized fixture to test FaissVectorStore with both embedders.
-    """
-    embedder = request.getfixturevalue(request.param)
-    faiss_store_instance = FaissVectorStore(
-        index_file_path=temp_faiss_file,
-        dimension=TEST_DIMENSION,
-        embedder=embedder,
-    )
-    yield faiss_store_instance
-
-# --- Test Cases ---
-
-def test_add_document(faiss_store: FaissVectorStore):
-    """
-    Test the add_document method to ensure it adds a vector and saves the index.
-    """
-    test_text = "This is a test document."
-
-    # Assert that the index is initially empty
-    assert faiss_store.index.ntotal == 0
-
-    # Add a document and check the index size
-    faiss_id = faiss_store.add_document(test_text)
-
-    assert faiss_store.index.ntotal == 1
-    assert faiss_id == 0
-    assert os.path.exists(faiss_store.index_file_path)
-
-def test_add_multiple_documents(faiss_store: FaissVectorStore):
-    """
-    Test that multiple documents can be added and the index size grows correctly.
-    """
-    docs = ["Doc 1", "Doc 2", "Doc 3"]
-
-    assert faiss_store.index.ntotal == 0
-
-    faiss_ids = faiss_store.add_multiple_documents(docs)
-
-    assert faiss_store.index.ntotal == 3
-    assert len(faiss_ids) == 3
-    assert faiss_ids == [0, 1, 2]
-
-def test_load_existing_index(temp_faiss_file, mock_embedder):
-    """
-    Test that the store can load an existing index file from disk.
-    """
-    # 1. Create a store and add a document to it
-    first_store = FaissVectorStore(
-        index_file_path=temp_faiss_file,
-        dimension=TEST_DIMENSION,
-        embedder=mock_embedder,
-    )
-    first_store.add_document("Document for persistence test.")
-
-    # 2. Create a new store instance with the same file path
-    #    This should load the existing index, not create a new one
-    second_store = FaissVectorStore(
-        index_file_path=temp_faiss_file,
-        dimension=TEST_DIMENSION,
-        embedder=mock_embedder,
-    )
-
-    # 3. Assert that the second store has the data from the first
-    assert second_store.index.ntotal == 1
-    assert second_store.doc_id_map == [0]
-
-def test_search_similar_documents(faiss_store: FaissVectorStore):
-    """
-    Test search functionality with a mock and a real embedder,
-    verifying the format of the results.
-    """
-    # Add documents to the store
-    faiss_store.add_document("The sun is a star.")
-    faiss_store.add_document("Mars is a planet.")
-    faiss_store.add_document("The moon orbits the Earth.")
-
-    # Since our embeddings are random (for the mock) or not guaranteed to be close,
-    # we just check that the search returns the correct number of results.
-    query_text = "What is a star?"
-    k = 2
-
-    search_results = faiss_store.search_similar_documents(query_text, k=k)
-
-    assert len(search_results) == k
-    assert isinstance(search_results[0], int)
diff --git a/ai-hub/tests/core/vector_store/__init__.py b/ai-hub/tests/core/vector_store/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/ai-hub/tests/core/vector_store/__init__.py
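One loose end worth flagging: the new faiss_store.py imports save_faiss_index and load_faiss_index from .utils but never calls them, writing through faiss.write_index and faiss.read_index directly. A sketch of how the store's persistence methods could delegate to those helpers instead; this is a suggested follow-up, not part of the diff:

```python
# Suggested follow-up (not in the diff): route persistence through utils
# so faiss_store.py's import of the helpers is actually used.
import logging
from app.core.vector_store.faiss_store import FaissVectorStore
from app.core.vector_store.utils import save_faiss_index, load_faiss_index

def save_index(self):
    if self.index:
        logging.info(f"Saving FAISS index to {self.index_file_path}")
        save_faiss_index(self.index, self.index_file_path)

def load_index(self):
    loaded = load_faiss_index(self.index_file_path)  # returns None if missing
    if loaded is not None:
        self.index = loaded

# Shown monkey-patch style for a self-contained demo; in the codebase these
# bodies would simply replace the two methods inside FaissVectorStore.
FaissVectorStore.save_index = save_index
FaissVectorStore.load_index = load_index
```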
- """ - logging.debug("Calling GenAI for embedding...") - if not self.api_key: - raise ValueError("API key not set for GenAIEmbedder.") - - # Construct the API endpoint URL - api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" - - # Build the request headers and payload - headers = { - 'Content-Type': 'application/json', - 'x-goog-api-key': self.api_key - } - payload = { - "model": f"models/{self.model_name}", - "content": {"parts": [{"text": text}]}, - "output_dimensionality": self.dimension - } - - try: - # Use the synchronous 'requests' library - response = requests.post(api_url, headers=headers, data=json.dumps(payload)) - response.raise_for_status() # Raise an exception for bad status codes - - result = response.json() - - # The 'embedding' field in the JSON response contains a 'values' list. - if 'embedding' not in result or 'values' not in result['embedding']: - raise KeyError("API response is missing the 'embedding' or 'values' field.") - - # Extract the embedding values and convert to a numpy array - embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1) - logging.debug("GenAI embedding successfully generated.") - return embedding - except requests.exceptions.RequestException as e: - logging.error(f"HTTP client error embedding text with GenAI: {e}") - raise - except Exception as e: - logging.error(f"Error embedding text with GenAI: {e}") - raise e - - -# --- Embedder Factory --- - -def get_embedder_from_config( - provider: EmbeddingProvider, - dimension: Optional[int], - model_name: Optional[str], - api_key: Optional[str] -): - """ - Factory function to create a synchronous embedder instance based on the configuration. - """ - if provider == EmbeddingProvider.GOOGLE_GENAI: - if not api_key: - raise ValueError("Google GenAI requires an API key to be set in the configuration.") - - logging.info(f"Using GenAIEmbedder with model: {model_name}") - return GenAIEmbedder(model_name=model_name, api_key=api_key,dimension=dimension) - elif provider == EmbeddingProvider.MOCK: - logging.info("Using MockEmbedder.") - return MockEmbedder(dimension=dimension) - else: - raise ValueError(f"Unsupported embedding provider: {provider}") - - -# --- Vector Store Core --- - -class VectorStore: - """An abstract base class for vector stores.""" - def add_document(self, text: str) -> int: - raise NotImplementedError - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - raise NotImplementedError - -class FaissVectorStore(VectorStore): - """ - An in-memory vector store using the FAISS library for efficient similarity search. - This implementation handles the persistence of the FAISS index to a file. - """ - def __init__(self, index_file_path: str, dimension: int, embedder): - """ - Initializes the FaissVectorStore. - """ - self.index_file_path = index_file_path - self.dimension = dimension - self.embedder = embedder - - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) - self.doc_id_map = list(range(self.index.ntotal)) - else: - logging.info("Creating a new FAISS index.") - self.index = faiss.IndexFlatL2(dimension) - self.doc_id_map = [] - - def add_document(self, text: str) -> int: - """ - Embeds a document's text and adds the vector to the FAISS index. - This is now a synchronous method. 
- """ - logging.debug("Embedding document text for FAISS index...") - vector = self.embedder.embed_text(text) - vector = vector.reshape(1, -1) - self.index.add(vector) - - new_doc_id = self.index.ntotal - 1 - self.doc_id_map.append(new_doc_id) - - self.save_index() - logging.info(f"Document added to FAISS index with ID: {new_doc_id}") - - return new_doc_id - - def add_multiple_documents(self, texts: List[str]) -> List[int]: - """ - Embeds multiple documents' texts and adds the vectors to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding multiple document texts for FAISS index...") - # Embed each text synchronously - vectors = [self.embedder.embed_text(text) for text in texts] - - # Reshape the vectors to be suitable for FAISS - vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') - self.index.add(vectors) - - new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) - self.doc_id_map.extend(new_doc_ids) - self.save_index() - - logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") - return new_doc_ids - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - """ - Embeds a query string and performs a similarity search in the FAISS index. - This is now a synchronous method. - """ - logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") - if self.index.ntotal == 0: - logging.warning("FAISS index is empty, no documents to search.") - return [] - - query_vector = self.embedder.embed_text(query_text) - query_vector = query_vector.reshape(1, -1) - - D, I = self.index.search(query_vector, k) - - result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] - logging.info(f"Search complete, found {len(result_ids)} similar documents.") - return result_ids - - def save_index(self): - """ - Saves the FAISS index to the specified file path. - """ - if self.index: - logging.info(f"Saving FAISS index to {self.index_file_path}") - faiss.write_index(self.index, self.index_file_path) - - def load_index(self): - """ - Loads a FAISS index from the specified file path. - """ - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/__init__.py b/ai-hub/app/core/vector_store/__init__.py new file mode 100644 index 0000000..3fbb1fd --- /dev/null +++ b/ai-hub/app/core/vector_store/__init__.py @@ -0,0 +1 @@ +# This file can be left empty. 
diff --git a/ai-hub/app/core/vector_store/base.py b/ai-hub/app/core/vector_store/base.py new file mode 100644 index 0000000..e8fd31f --- /dev/null +++ b/ai-hub/app/core/vector_store/base.py @@ -0,0 +1,8 @@ +from typing import List + +class VectorStore: + def add_document(self, text: str) -> int: + raise NotImplementedError + + def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: + raise NotImplementedError diff --git a/ai-hub/app/core/vector_store/embedder/factory.py b/ai-hub/app/core/vector_store/embedder/factory.py new file mode 100644 index 0000000..d958d3f --- /dev/null +++ b/ai-hub/app/core/vector_store/embedder/factory.py @@ -0,0 +1,11 @@ +from app.config import EmbeddingProvider +from .genai import GenAIEmbedder +from .mock import MockEmbedder + +def get_embedder_from_config(provider, dimension, model_name, api_key): + if provider == EmbeddingProvider.GOOGLE_GENAI: + return GenAIEmbedder(model_name, api_key, dimension) + elif provider == EmbeddingProvider.MOCK: + return MockEmbedder(dimension) + else: + raise ValueError(f"Unsupported embedding provider: {provider}") diff --git a/ai-hub/app/core/vector_store/embedder/genai.py b/ai-hub/app/core/vector_store/embedder/genai.py new file mode 100644 index 0000000..99935d1 --- /dev/null +++ b/ai-hub/app/core/vector_store/embedder/genai.py @@ -0,0 +1,56 @@ +import json +import logging +import requests +import numpy as np + +class GenAIEmbedder: + """An embedder that uses the Google Generative AI service via direct synchronous HTTP.""" + def __init__(self, model_name: str, api_key: str, dimension: int): + self.model_name = model_name + self.api_key = api_key + self.dimension = dimension + + def embed_text(self, text: str) -> np.ndarray: + """ + Generates an embedding by making a direct synchronous HTTP POST request + to the Gemini Embedding API. + """ + logging.debug("Calling GenAI for embedding...") + if not self.api_key: + raise ValueError("API key not set for GenAIEmbedder.") + + # Construct the API endpoint URL + api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" + + # Build the request headers and payload + headers = { + 'Content-Type': 'application/json', + 'x-goog-api-key': self.api_key + } + payload = { + "model": f"models/{self.model_name}", + "content": {"parts": [{"text": text}]}, + "output_dimensionality": self.dimension + } + + try: + # Use the synchronous 'requests' library + response = requests.post(api_url, headers=headers, data=json.dumps(payload)) + response.raise_for_status() # Raise an exception for bad status codes + + result = response.json() + + # The 'embedding' field in the JSON response contains a 'values' list. 
diff --git a/ai-hub/app/core/vector_store/embedder/genai.py b/ai-hub/app/core/vector_store/embedder/genai.py
new file mode 100644
index 0000000..99935d1
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/genai.py
@@ -0,0 +1,56 @@
+import json
+import logging
+import requests
+import numpy as np
+
+class GenAIEmbedder:
+    """An embedder that uses the Google Generative AI service via direct synchronous HTTP."""
+    def __init__(self, model_name: str, api_key: str, dimension: int):
+        self.model_name = model_name
+        self.api_key = api_key
+        self.dimension = dimension
+
+    def embed_text(self, text: str) -> np.ndarray:
+        """
+        Generates an embedding by making a direct synchronous HTTP POST request
+        to the Gemini Embedding API.
+        """
+        logging.debug("Calling GenAI for embedding...")
+        if not self.api_key:
+            raise ValueError("API key not set for GenAIEmbedder.")
+
+        # Construct the API endpoint URL
+        api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent"
+
+        # Build the request headers and payload
+        headers = {
+            'Content-Type': 'application/json',
+            'x-goog-api-key': self.api_key
+        }
+        payload = {
+            "model": f"models/{self.model_name}",
+            "content": {"parts": [{"text": text}]},
+            "output_dimensionality": self.dimension
+        }
+
+        try:
+            # Use the synchronous 'requests' library
+            response = requests.post(api_url, headers=headers, data=json.dumps(payload))
+            response.raise_for_status()  # Raise an exception for bad status codes
+
+            result = response.json()
+
+            # The 'embedding' field in the JSON response contains a 'values' list.
+            if 'embedding' not in result or 'values' not in result['embedding']:
+                raise KeyError("API response is missing the 'embedding' or 'values' field.")
+
+            # Extract the embedding values and convert to a numpy array
+            embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1)
+            logging.debug("GenAI embedding successfully generated.")
+            return embedding
+        except requests.exceptions.RequestException as e:
+            logging.error(f"HTTP client error embedding text with GenAI: {e}")
+            raise
+        except Exception as e:
+            logging.error(f"Error embedding text with GenAI: {e}")
+            raise e
diff --git a/ai-hub/app/core/vector_store/embedder/mock.py b/ai-hub/app/core/vector_store/embedder/mock.py
new file mode 100644
index 0000000..0140e38
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/mock.py
@@ -0,0 +1,10 @@
+import numpy as np
+import logging
+
+class MockEmbedder:
+    def __init__(self, dimension: int):
+        self.dimension = dimension
+
+    def embed_text(self, text: str) -> np.ndarray:
+        logging.debug("Generating mock embedding...")
+        return np.random.rand(self.dimension).astype('float32').reshape(1, -1)
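Because GenAIEmbedder performs a real HTTP POST, exercising it offline means stubbing requests.post, which is exactly what the test fixtures later in this diff do. A minimal sketch along those lines (the key and dimension are illustrative):

    import numpy as np
    from unittest import mock
    from unittest.mock import MagicMock
    from app.core.vector_store.embedder.genai import GenAIEmbedder

    with mock.patch('requests.post') as mock_post:
        fake_response = MagicMock()
        fake_response.raise_for_status.return_value = None
        fake_response.json.return_value = {"embedding": {"values": np.random.rand(768).tolist()}}
        mock_post.return_value = fake_response

        embedder = GenAIEmbedder("gemini-embedding-001", "mock_api_key", 768)
        assert embedder.embed_text("hello").shape == (1, 768)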
+ """ + logging.debug("Embedding multiple document texts for FAISS index...") + # Embed each text synchronously + vectors = [self.embedder.embed_text(text) for text in texts] + + # Reshape the vectors to be suitable for FAISS + vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') + self.index.add(vectors) + + new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) + self.doc_id_map.extend(new_doc_ids) + self.save_index() + + logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") + return new_doc_ids + + def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: + """ + Embeds a query string and performs a similarity search in the FAISS index. + This is now a synchronous method. + """ + logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") + if self.index.ntotal == 0: + logging.warning("FAISS index is empty, no documents to search.") + return [] + + query_vector = self.embedder.embed_text(query_text) + query_vector = query_vector.reshape(1, -1) + + D, I = self.index.search(query_vector, k) + + result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] + logging.info(f"Search complete, found {len(result_ids)} similar documents.") + return result_ids + + def save_index(self): + """ + Saves the FAISS index to the specified file path. + """ + if self.index: + logging.info(f"Saving FAISS index to {self.index_file_path}") + faiss.write_index(self.index, self.index_file_path) + + def load_index(self): + """ + Loads a FAISS index from the specified file path. + """ + if os.path.exists(self.index_file_path): + logging.info(f"Loading FAISS index from {self.index_file_path}") + self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/utils.py b/ai-hub/app/core/vector_store/utils.py new file mode 100644 index 0000000..af2bc48 --- /dev/null +++ b/ai-hub/app/core/vector_store/utils.py @@ -0,0 +1,11 @@ +import faiss +import os + +def save_faiss_index(index, path: str): + if index: + faiss.write_index(index, path) + +def load_faiss_index(path: str): + if os.path.exists(path): + return faiss.read_index(path) + return None diff --git a/ai-hub/tests/core/test_services.py b/ai-hub/tests/core/test_services.py index a5d1191..d36c203 100644 --- a/ai-hub/tests/core/test_services.py +++ b/ai-hub/tests/core/test_services.py @@ -10,7 +10,8 @@ # Import the service and its dependencies from app.core.services import RAGService from app.db import models -from app.core.vector_store import FaissVectorStore, MockEmbedder +from app.core.vector_store.faiss_store import FaissVectorStore +from app.core.vector_store.embedder.mock import MockEmbedder # Import FaissDBRetriever and a mock WebRetriever for testing different cases from app.core.retrievers import FaissDBRetriever, Retriever from app.core.pipelines.dspy_rag import DspyRagPipeline, DSPyLLMProvider @@ -227,7 +228,7 @@ @patch('app.db.models.VectorMetadata') @patch('app.db.models.Document') -@patch('app.core.vector_store.FaissVectorStore') +@patch('app.core.vector_store.faiss_store.FaissVectorStore') def test_rag_service_add_document_success(mock_vector_store, mock_document_model, mock_vector_metadata_model): """ Test the RAGService.add_document method for a successful run. 
diff --git a/ai-hub/tests/core/test_services.py b/ai-hub/tests/core/test_services.py
index a5d1191..d36c203 100644
--- a/ai-hub/tests/core/test_services.py
+++ b/ai-hub/tests/core/test_services.py
@@ -10,7 +10,8 @@
 # Import the service and its dependencies
 from app.core.services import RAGService
 from app.db import models
-from app.core.vector_store import FaissVectorStore, MockEmbedder
+from app.core.vector_store.faiss_store import FaissVectorStore
+from app.core.vector_store.embedder.mock import MockEmbedder
 # Import FaissDBRetriever and a mock WebRetriever for testing different cases
 from app.core.retrievers import FaissDBRetriever, Retriever
 from app.core.pipelines.dspy_rag import DspyRagPipeline, DSPyLLMProvider
@@ -227,7 +228,7 @@
 
 @patch('app.db.models.VectorMetadata')
 @patch('app.db.models.Document')
-@patch('app.core.vector_store.FaissVectorStore')
+@patch('app.core.vector_store.faiss_store.FaissVectorStore')
 def test_rag_service_add_document_success(mock_vector_store, mock_document_model, mock_vector_metadata_model):
     """
     Test the RAGService.add_document method for a successful run.
@@ -282,7 +283,7 @@
         embedding_model="mock_embedder"  # This now passes because the mock embedder is of type MockEmbedder
     )
 
-@patch('app.core.vector_store.FaissVectorStore')
+@patch('app.core.vector_store.faiss_store.FaissVectorStore')
 def test_rag_service_add_document_error_handling(mock_vector_store):
     """
     Test the RAGService.add_document method's error handling.
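Beyond the imports, the only change these tests need is the patch target: mock targets are import paths, so they follow FaissVectorStore to its new module. A hypothetical test body using the updated target:

    from unittest.mock import patch

    @patch('app.core.vector_store.faiss_store.FaissVectorStore')
    def test_uses_patched_store(mock_store_cls):
        mock_store_cls.return_value.add_document.return_value = 0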
- """ - docs = ["Doc 1", "Doc 2", "Doc 3"] - - assert faiss_store.index.ntotal == 0 - - faiss_ids = faiss_store.add_multiple_documents(docs) - - assert faiss_store.index.ntotal == 3 - assert len(faiss_ids) == 3 - assert faiss_ids == [0, 1, 2] - -def test_load_existing_index(temp_faiss_file, mock_embedder): - """ - Test that the store can load an existing index file from disk. - """ - # 1. Create a store and add a document to it - first_store = FaissVectorStore( - index_file_path=temp_faiss_file, - dimension=TEST_DIMENSION, - embedder=mock_embedder, - ) - first_store.add_document("Document for persistence test.") - - # 2. Create a new store instance with the same file path - # This should load the existing index, not create a new one - second_store = FaissVectorStore( - index_file_path=temp_faiss_file, - dimension=TEST_DIMENSION, - embedder=mock_embedder, - ) - - # 3. Assert that the second store has the data from the first - assert second_store.index.ntotal == 1 - assert second_store.doc_id_map == [0] - -def test_search_similar_documents(faiss_store: FaissVectorStore): - """ - Test search functionality with a mock and a real embedder, - verifying the format of the results. - """ - # Add documents to the store - faiss_store.add_document("The sun is a star.") - faiss_store.add_document("Mars is a planet.") - faiss_store.add_document("The moon orbits the Earth.") - - # Since our embeddings are random (for the mock) or not guaranteed to be close, - # we just check that the search returns the correct number of results. - query_text = "What is a star?" - k = 2 - - search_results = faiss_store.search_similar_documents(query_text, k=k) - - assert len(search_results) == k - assert isinstance(search_results[0], int) diff --git a/ai-hub/tests/core/vector_store/__init__.py b/ai-hub/tests/core/vector_store/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/ai-hub/tests/core/vector_store/__init__.py diff --git a/ai-hub/tests/core/vector_store/conftest.py b/ai-hub/tests/core/vector_store/conftest.py new file mode 100644 index 0000000..a3fe41b --- /dev/null +++ b/ai-hub/tests/core/vector_store/conftest.py @@ -0,0 +1,39 @@ +import pytest +import numpy as np +from unittest import mock +from unittest.mock import MagicMock +from app.core.vector_store.faiss_store import FaissVectorStore +from app.core.vector_store.embedder.mock import MockEmbedder +from app.core.vector_store.embedder.genai import GenAIEmbedder + +TEST_DIMENSION = 768 + +@pytest.fixture +def temp_faiss_file(tmp_path): + test_dir = tmp_path / "faiss_test" + test_dir.mkdir() + return str(test_dir / "test_index.faiss") + +@pytest.fixture +def mock_embedder(): + return MockEmbedder(dimension=TEST_DIMENSION) + +@pytest.fixture +def mock_genai_embedder(): + with mock.patch('requests.post') as mock_post: + embedding_data = np.random.rand(TEST_DIMENSION).tolist() + mock_response = MagicMock() + mock_response.raise_for_status.return_value = None + mock_response.json.return_value = {"embedding": {"values": embedding_data}} + mock_post.return_value = mock_response + + yield GenAIEmbedder("gemini-embedding-001", "mock_api_key", TEST_DIMENSION) + +@pytest.fixture(params=[ + pytest.param('mock_embedder', id="MockEmbedder"), + pytest.param('mock_genai_embedder', id="GenAIEmbedder") +]) +def faiss_store(request, temp_faiss_file): + embedder = request.getfixturevalue(request.param) + store = FaissVectorStore(temp_faiss_file, TEST_DIMENSION, embedder) + yield store diff --git a/ai-hub/app/app.py b/ai-hub/app/app.py index dc5f6e6..df21ba3 
diff --git a/ai-hub/tests/core/vector_store/test_embedder_factory.py b/ai-hub/tests/core/vector_store/test_embedder_factory.py
new file mode 100644
index 0000000..7413376
--- /dev/null
+++ b/ai-hub/tests/core/vector_store/test_embedder_factory.py
@@ -0,0 +1,20 @@
+import pytest
+from app.core.vector_store.embedder.factory import get_embedder_from_config
+from app.config import EmbeddingProvider
+
+def test_get_mock_embedder():
+    embedder = get_embedder_from_config(EmbeddingProvider.MOCK, 768, None, None)
+    assert embedder is not None
+    assert embedder.dimension == 768
+
+def test_get_genai_embedder():
+    embedder = get_embedder_from_config(
+        EmbeddingProvider.GOOGLE_GENAI, 768, "gemini-embedding-001", "fake_key"
+    )
+    assert embedder.model_name == "gemini-embedding-001"
+    assert embedder.api_key == "fake_key"
+    assert embedder.dimension == 768
+
+def test_get_embedder_raises_for_invalid_provider():
+    with pytest.raises(ValueError):
+        get_embedder_from_config("invalid", 768, None, None)
- """ - logging.debug("Generating mock embedding...") - return np.random.rand(self.dimension).astype('float32').reshape(1, -1) - -class GenAIEmbedder: - """An embedder that uses the Google Generative AI service via direct synchronous HTTP.""" - def __init__(self, model_name: str, api_key: str, dimension: int): - self.model_name = model_name - self.api_key = api_key - self.dimension = dimension - - def embed_text(self, text: str) -> np.ndarray: - """ - Generates an embedding by making a direct synchronous HTTP POST request - to the Gemini Embedding API. - """ - logging.debug("Calling GenAI for embedding...") - if not self.api_key: - raise ValueError("API key not set for GenAIEmbedder.") - - # Construct the API endpoint URL - api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" - - # Build the request headers and payload - headers = { - 'Content-Type': 'application/json', - 'x-goog-api-key': self.api_key - } - payload = { - "model": f"models/{self.model_name}", - "content": {"parts": [{"text": text}]}, - "output_dimensionality": self.dimension - } - - try: - # Use the synchronous 'requests' library - response = requests.post(api_url, headers=headers, data=json.dumps(payload)) - response.raise_for_status() # Raise an exception for bad status codes - - result = response.json() - - # The 'embedding' field in the JSON response contains a 'values' list. - if 'embedding' not in result or 'values' not in result['embedding']: - raise KeyError("API response is missing the 'embedding' or 'values' field.") - - # Extract the embedding values and convert to a numpy array - embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1) - logging.debug("GenAI embedding successfully generated.") - return embedding - except requests.exceptions.RequestException as e: - logging.error(f"HTTP client error embedding text with GenAI: {e}") - raise - except Exception as e: - logging.error(f"Error embedding text with GenAI: {e}") - raise e - - -# --- Embedder Factory --- - -def get_embedder_from_config( - provider: EmbeddingProvider, - dimension: Optional[int], - model_name: Optional[str], - api_key: Optional[str] -): - """ - Factory function to create a synchronous embedder instance based on the configuration. - """ - if provider == EmbeddingProvider.GOOGLE_GENAI: - if not api_key: - raise ValueError("Google GenAI requires an API key to be set in the configuration.") - - logging.info(f"Using GenAIEmbedder with model: {model_name}") - return GenAIEmbedder(model_name=model_name, api_key=api_key,dimension=dimension) - elif provider == EmbeddingProvider.MOCK: - logging.info("Using MockEmbedder.") - return MockEmbedder(dimension=dimension) - else: - raise ValueError(f"Unsupported embedding provider: {provider}") - - -# --- Vector Store Core --- - -class VectorStore: - """An abstract base class for vector stores.""" - def add_document(self, text: str) -> int: - raise NotImplementedError - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - raise NotImplementedError - -class FaissVectorStore(VectorStore): - """ - An in-memory vector store using the FAISS library for efficient similarity search. - This implementation handles the persistence of the FAISS index to a file. - """ - def __init__(self, index_file_path: str, dimension: int, embedder): - """ - Initializes the FaissVectorStore. 
- """ - self.index_file_path = index_file_path - self.dimension = dimension - self.embedder = embedder - - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) - self.doc_id_map = list(range(self.index.ntotal)) - else: - logging.info("Creating a new FAISS index.") - self.index = faiss.IndexFlatL2(dimension) - self.doc_id_map = [] - - def add_document(self, text: str) -> int: - """ - Embeds a document's text and adds the vector to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding document text for FAISS index...") - vector = self.embedder.embed_text(text) - vector = vector.reshape(1, -1) - self.index.add(vector) - - new_doc_id = self.index.ntotal - 1 - self.doc_id_map.append(new_doc_id) - - self.save_index() - logging.info(f"Document added to FAISS index with ID: {new_doc_id}") - - return new_doc_id - - def add_multiple_documents(self, texts: List[str]) -> List[int]: - """ - Embeds multiple documents' texts and adds the vectors to the FAISS index. - This is now a synchronous method. - """ - logging.debug("Embedding multiple document texts for FAISS index...") - # Embed each text synchronously - vectors = [self.embedder.embed_text(text) for text in texts] - - # Reshape the vectors to be suitable for FAISS - vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32') - self.index.add(vectors) - - new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal)) - self.doc_id_map.extend(new_doc_ids) - self.save_index() - - logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.") - return new_doc_ids - - def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: - """ - Embeds a query string and performs a similarity search in the FAISS index. - This is now a synchronous method. - """ - logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'") - if self.index.ntotal == 0: - logging.warning("FAISS index is empty, no documents to search.") - return [] - - query_vector = self.embedder.embed_text(query_text) - query_vector = query_vector.reshape(1, -1) - - D, I = self.index.search(query_vector, k) - - result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0] - logging.info(f"Search complete, found {len(result_ids)} similar documents.") - return result_ids - - def save_index(self): - """ - Saves the FAISS index to the specified file path. - """ - if self.index: - logging.info(f"Saving FAISS index to {self.index_file_path}") - faiss.write_index(self.index, self.index_file_path) - - def load_index(self): - """ - Loads a FAISS index from the specified file path. - """ - if os.path.exists(self.index_file_path): - logging.info(f"Loading FAISS index from {self.index_file_path}") - self.index = faiss.read_index(self.index_file_path) diff --git a/ai-hub/app/core/vector_store/__init__.py b/ai-hub/app/core/vector_store/__init__.py new file mode 100644 index 0000000..3fbb1fd --- /dev/null +++ b/ai-hub/app/core/vector_store/__init__.py @@ -0,0 +1 @@ +# This file can be left empty. 
diff --git a/ai-hub/app/core/vector_store/base.py b/ai-hub/app/core/vector_store/base.py new file mode 100644 index 0000000..e8fd31f --- /dev/null +++ b/ai-hub/app/core/vector_store/base.py @@ -0,0 +1,8 @@ +from typing import List + +class VectorStore: + def add_document(self, text: str) -> int: + raise NotImplementedError + + def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]: + raise NotImplementedError diff --git a/ai-hub/app/core/vector_store/embedder/factory.py b/ai-hub/app/core/vector_store/embedder/factory.py new file mode 100644 index 0000000..d958d3f --- /dev/null +++ b/ai-hub/app/core/vector_store/embedder/factory.py @@ -0,0 +1,11 @@ +from app.config import EmbeddingProvider +from .genai import GenAIEmbedder +from .mock import MockEmbedder + +def get_embedder_from_config(provider, dimension, model_name, api_key): + if provider == EmbeddingProvider.GOOGLE_GENAI: + return GenAIEmbedder(model_name, api_key, dimension) + elif provider == EmbeddingProvider.MOCK: + return MockEmbedder(dimension) + else: + raise ValueError(f"Unsupported embedding provider: {provider}") diff --git a/ai-hub/app/core/vector_store/embedder/genai.py b/ai-hub/app/core/vector_store/embedder/genai.py new file mode 100644 index 0000000..99935d1 --- /dev/null +++ b/ai-hub/app/core/vector_store/embedder/genai.py @@ -0,0 +1,56 @@ +import json +import logging +import requests +import numpy as np + +class GenAIEmbedder: + """An embedder that uses the Google Generative AI service via direct synchronous HTTP.""" + def __init__(self, model_name: str, api_key: str, dimension: int): + self.model_name = model_name + self.api_key = api_key + self.dimension = dimension + + def embed_text(self, text: str) -> np.ndarray: + """ + Generates an embedding by making a direct synchronous HTTP POST request + to the Gemini Embedding API. + """ + logging.debug("Calling GenAI for embedding...") + if not self.api_key: + raise ValueError("API key not set for GenAIEmbedder.") + + # Construct the API endpoint URL + api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent" + + # Build the request headers and payload + headers = { + 'Content-Type': 'application/json', + 'x-goog-api-key': self.api_key + } + payload = { + "model": f"models/{self.model_name}", + "content": {"parts": [{"text": text}]}, + "output_dimensionality": self.dimension + } + + try: + # Use the synchronous 'requests' library + response = requests.post(api_url, headers=headers, data=json.dumps(payload)) + response.raise_for_status() # Raise an exception for bad status codes + + result = response.json() + + # The 'embedding' field in the JSON response contains a 'values' list. 
-            if 'embedding' not in result or 'values' not in result['embedding']:
-                raise KeyError("API response is missing the 'embedding' or 'values' field.")
-
-            # Extract the embedding values and convert to a numpy array
-            embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1)
-            logging.debug("GenAI embedding successfully generated.")
-            return embedding
-        except requests.exceptions.RequestException as e:
-            logging.error(f"HTTP client error embedding text with GenAI: {e}")
-            raise
-        except Exception as e:
-            logging.error(f"Error embedding text with GenAI: {e}")
-            raise e
-
-
-# --- Embedder Factory ---
-
-def get_embedder_from_config(
-    provider: EmbeddingProvider,
-    dimension: Optional[int],
-    model_name: Optional[str],
-    api_key: Optional[str]
-):
-    """
-    Factory function to create a synchronous embedder instance based on the configuration.
-    """
-    if provider == EmbeddingProvider.GOOGLE_GENAI:
-        if not api_key:
-            raise ValueError("Google GenAI requires an API key to be set in the configuration.")
-
-        logging.info(f"Using GenAIEmbedder with model: {model_name}")
-        return GenAIEmbedder(model_name=model_name, api_key=api_key, dimension=dimension)
-    elif provider == EmbeddingProvider.MOCK:
-        logging.info("Using MockEmbedder.")
-        return MockEmbedder(dimension=dimension)
-    else:
-        raise ValueError(f"Unsupported embedding provider: {provider}")
-
-
-# --- Vector Store Core ---
-
-class VectorStore:
-    """An abstract base class for vector stores."""
-    def add_document(self, text: str) -> int:
-        raise NotImplementedError
-
-    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
-        raise NotImplementedError
-
-class FaissVectorStore(VectorStore):
-    """
-    An in-memory vector store using the FAISS library for efficient similarity search.
-    This implementation handles the persistence of the FAISS index to a file.
-    """
-    def __init__(self, index_file_path: str, dimension: int, embedder):
-        """
-        Initializes the FaissVectorStore.
-        """
-        self.index_file_path = index_file_path
-        self.dimension = dimension
-        self.embedder = embedder
-
-        if os.path.exists(self.index_file_path):
-            logging.info(f"Loading FAISS index from {self.index_file_path}")
-            self.index = faiss.read_index(self.index_file_path)
-            self.doc_id_map = list(range(self.index.ntotal))
-        else:
-            logging.info("Creating a new FAISS index.")
-            self.index = faiss.IndexFlatL2(dimension)
-            self.doc_id_map = []
-
-    def add_document(self, text: str) -> int:
-        """
-        Embeds a document's text and adds the vector to the FAISS index.
-        This is now a synchronous method.
-        """
-        logging.debug("Embedding document text for FAISS index...")
-        vector = self.embedder.embed_text(text)
-        vector = vector.reshape(1, -1)
-        self.index.add(vector)
-
-        new_doc_id = self.index.ntotal - 1
-        self.doc_id_map.append(new_doc_id)
-
-        self.save_index()
-        logging.info(f"Document added to FAISS index with ID: {new_doc_id}")
-
-        return new_doc_id
-
-    def add_multiple_documents(self, texts: List[str]) -> List[int]:
-        """
-        Embeds multiple documents' texts and adds the vectors to the FAISS index.
-        This is now a synchronous method.
-        """
-        logging.debug("Embedding multiple document texts for FAISS index...")
-        # Embed each text synchronously
-        vectors = [self.embedder.embed_text(text) for text in texts]
-
-        # Reshape the vectors to be suitable for FAISS
-        vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32')
-        self.index.add(vectors)
-
-        new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal))
-        self.doc_id_map.extend(new_doc_ids)
-        self.save_index()
-
-        logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.")
-        return new_doc_ids
-
-    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
-        """
-        Embeds a query string and performs a similarity search in the FAISS index.
-        This is now a synchronous method.
-        """
-        logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'")
-        if self.index.ntotal == 0:
-            logging.warning("FAISS index is empty, no documents to search.")
-            return []
-
-        query_vector = self.embedder.embed_text(query_text)
-        query_vector = query_vector.reshape(1, -1)
-
-        D, I = self.index.search(query_vector, k)
-
-        result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0]
-        logging.info(f"Search complete, found {len(result_ids)} similar documents.")
-        return result_ids
-
-    def save_index(self):
-        """
-        Saves the FAISS index to the specified file path.
-        """
-        if self.index:
-            logging.info(f"Saving FAISS index to {self.index_file_path}")
-            faiss.write_index(self.index, self.index_file_path)
-
-    def load_index(self):
-        """
-        Loads a FAISS index from the specified file path.
-        """
-        if os.path.exists(self.index_file_path):
-            logging.info(f"Loading FAISS index from {self.index_file_path}")
-            self.index = faiss.read_index(self.index_file_path)
diff --git a/ai-hub/app/core/vector_store/__init__.py b/ai-hub/app/core/vector_store/__init__.py
new file mode 100644
index 0000000..3fbb1fd
--- /dev/null
+++ b/ai-hub/app/core/vector_store/__init__.py
@@ -0,0 +1 @@
+# This file can be left empty.
diff --git a/ai-hub/app/core/vector_store/base.py b/ai-hub/app/core/vector_store/base.py
new file mode 100644
index 0000000..e8fd31f
--- /dev/null
+++ b/ai-hub/app/core/vector_store/base.py
@@ -0,0 +1,8 @@
+from typing import List
+
+class VectorStore:
+    def add_document(self, text: str) -> int:
+        raise NotImplementedError
+
+    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
+        raise NotImplementedError
diff --git a/ai-hub/app/core/vector_store/embedder/factory.py b/ai-hub/app/core/vector_store/embedder/factory.py
new file mode 100644
index 0000000..d958d3f
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/factory.py
@@ -0,0 +1,11 @@
+from app.config import EmbeddingProvider
+from .genai import GenAIEmbedder
+from .mock import MockEmbedder
+
+def get_embedder_from_config(provider, dimension, model_name, api_key):
+    if provider == EmbeddingProvider.GOOGLE_GENAI:
+        return GenAIEmbedder(model_name, api_key, dimension)
+    elif provider == EmbeddingProvider.MOCK:
+        return MockEmbedder(dimension)
+    else:
+        raise ValueError(f"Unsupported embedding provider: {provider}")
diff --git a/ai-hub/app/core/vector_store/embedder/genai.py b/ai-hub/app/core/vector_store/embedder/genai.py
new file mode 100644
index 0000000..99935d1
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/genai.py
@@ -0,0 +1,56 @@
+import json
+import logging
+import requests
+import numpy as np
+
+class GenAIEmbedder:
+    """An embedder that uses the Google Generative AI service via direct synchronous HTTP."""
+    def __init__(self, model_name: str, api_key: str, dimension: int):
+        self.model_name = model_name
+        self.api_key = api_key
+        self.dimension = dimension
+
+    def embed_text(self, text: str) -> np.ndarray:
+        """
+        Generates an embedding by making a direct synchronous HTTP POST request
+        to the Gemini Embedding API.
+        """
+        logging.debug("Calling GenAI for embedding...")
+        if not self.api_key:
+            raise ValueError("API key not set for GenAIEmbedder.")
+
+        # Construct the API endpoint URL
+        api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:embedContent"
+
+        # Build the request headers and payload
+        headers = {
+            'Content-Type': 'application/json',
+            'x-goog-api-key': self.api_key
+        }
+        payload = {
+            "model": f"models/{self.model_name}",
+            "content": {"parts": [{"text": text}]},
+            "output_dimensionality": self.dimension
+        }
+
+        try:
+            # Use the synchronous 'requests' library
+            response = requests.post(api_url, headers=headers, data=json.dumps(payload))
+            response.raise_for_status()  # Raise an exception for bad status codes
+
+            result = response.json()
+
+            # The 'embedding' field in the JSON response contains a 'values' list.
+            if 'embedding' not in result or 'values' not in result['embedding']:
+                raise KeyError("API response is missing the 'embedding' or 'values' field.")
+
+            # Extract the embedding values and convert to a numpy array
+            embedding = np.array(result["embedding"]["values"], dtype='float32').reshape(1, -1)
+            logging.debug("GenAI embedding successfully generated.")
+            return embedding
+        except requests.exceptions.RequestException as e:
+            logging.error(f"HTTP client error embedding text with GenAI: {e}")
+            raise
+        except Exception as e:
+            logging.error(f"Error embedding text with GenAI: {e}")
+            raise e
diff --git a/ai-hub/app/core/vector_store/embedder/mock.py b/ai-hub/app/core/vector_store/embedder/mock.py
new file mode 100644
index 0000000..0140e38
--- /dev/null
+++ b/ai-hub/app/core/vector_store/embedder/mock.py
@@ -0,0 +1,10 @@
+import numpy as np
+import logging
+
+class MockEmbedder:
+    def __init__(self, dimension: int):
+        self.dimension = dimension
+
+    def embed_text(self, text: str) -> np.ndarray:
+        logging.debug("Generating mock embedding...")
+        return np.random.rand(self.dimension).astype('float32').reshape(1, -1)
diff --git a/ai-hub/app/core/vector_store/faiss_store.py b/ai-hub/app/core/vector_store/faiss_store.py
new file mode 100644
index 0000000..5573c98
--- /dev/null
+++ b/ai-hub/app/core/vector_store/faiss_store.py
@@ -0,0 +1,102 @@
+import os
+import logging
+import faiss
+import numpy as np
+from .base import VectorStore
+from .utils import save_faiss_index, load_faiss_index
+from typing import List, Optional, Dict, Any
+
+class FaissVectorStore(VectorStore):
+    """
+    An in-memory vector store using the FAISS library for efficient similarity search.
+    This implementation handles the persistence of the FAISS index to a file.
+    """
+    def __init__(self, index_file_path: str, dimension: int, embedder):
+        """
+        Initializes the FaissVectorStore.
+        """
+        self.index_file_path = index_file_path
+        self.dimension = dimension
+        self.embedder = embedder
+
+        if os.path.exists(self.index_file_path):
+            logging.info(f"Loading FAISS index from {self.index_file_path}")
+            self.index = faiss.read_index(self.index_file_path)
+            self.doc_id_map = list(range(self.index.ntotal))
+        else:
+            logging.info("Creating a new FAISS index.")
+            self.index = faiss.IndexFlatL2(dimension)
+            self.doc_id_map = []
+
+    def add_document(self, text: str) -> int:
+        """
+        Embeds a document's text and adds the vector to the FAISS index.
+        This is now a synchronous method.
+        """
+        logging.debug("Embedding document text for FAISS index...")
+        vector = self.embedder.embed_text(text)
+        vector = vector.reshape(1, -1)
+        self.index.add(vector)
+
+        new_doc_id = self.index.ntotal - 1
+        self.doc_id_map.append(new_doc_id)
+
+        self.save_index()
+        logging.info(f"Document added to FAISS index with ID: {new_doc_id}")
+
+        return new_doc_id
+
+    def add_multiple_documents(self, texts: List[str]) -> List[int]:
+        """
+        Embeds multiple documents' texts and adds the vectors to the FAISS index.
+        This is now a synchronous method.
+        """
+        logging.debug("Embedding multiple document texts for FAISS index...")
+        # Embed each text synchronously
+        vectors = [self.embedder.embed_text(text) for text in texts]
+
+        # Reshape the vectors to be suitable for FAISS
+        vectors = np.vstack([v.reshape(1, -1) for v in vectors]).astype('float32')
+        self.index.add(vectors)
+
+        new_doc_ids = list(range(self.index.ntotal - len(texts), self.index.ntotal))
+        self.doc_id_map.extend(new_doc_ids)
+        self.save_index()
+
+        logging.info(f"Added {len(new_doc_ids)} documents to FAISS index.")
+        return new_doc_ids
+
+    def search_similar_documents(self, query_text: str, k: int = 5) -> List[int]:
+        """
+        Embeds a query string and performs a similarity search in the FAISS index.
+        This is now a synchronous method.
+        """
+        logging.debug(f"Searching FAISS index for similar documents to query: '{query_text[:50]}...'")
+        if self.index.ntotal == 0:
+            logging.warning("FAISS index is empty, no documents to search.")
+            return []
+
+        query_vector = self.embedder.embed_text(query_text)
+        query_vector = query_vector.reshape(1, -1)
+
+        D, I = self.index.search(query_vector, k)
+
+        result_ids = [self.doc_id_map[int(i)] for i in I.flatten() if i >= 0]
+        logging.info(f"Search complete, found {len(result_ids)} similar documents.")
+        return result_ids
+
+    def save_index(self):
+        """
+        Saves the FAISS index to the specified file path.
+        """
+        if self.index:
+            logging.info(f"Saving FAISS index to {self.index_file_path}")
+            faiss.write_index(self.index, self.index_file_path)
+
+    def load_index(self):
+        """
+        Loads a FAISS index from the specified file path.
+        """
+        if os.path.exists(self.index_file_path):
+            logging.info(f"Loading FAISS index from {self.index_file_path}")
+            self.index = faiss.read_index(self.index_file_path)
diff --git a/ai-hub/app/core/vector_store/utils.py b/ai-hub/app/core/vector_store/utils.py
new file mode 100644
index 0000000..af2bc48
--- /dev/null
+++ b/ai-hub/app/core/vector_store/utils.py
@@ -0,0 +1,11 @@
+import faiss
+import os
+
+def save_faiss_index(index, path: str):
+    if index:
+        faiss.write_index(index, path)
+
+def load_faiss_index(path: str):
+    if os.path.exists(path):
+        return faiss.read_index(path)
+    return None
diff --git a/ai-hub/tests/core/test_services.py b/ai-hub/tests/core/test_services.py
index a5d1191..d36c203 100644
--- a/ai-hub/tests/core/test_services.py
+++ b/ai-hub/tests/core/test_services.py
@@ -10,7 +10,8 @@
 # Import the service and its dependencies
 from app.core.services import RAGService
 from app.db import models
-from app.core.vector_store import FaissVectorStore, MockEmbedder
+from app.core.vector_store.faiss_store import FaissVectorStore
+from app.core.vector_store.embedder.mock import MockEmbedder
 # Import FaissDBRetriever and a mock WebRetriever for testing different cases
 from app.core.retrievers import FaissDBRetriever, Retriever
 from app.core.pipelines.dspy_rag import DspyRagPipeline, DSPyLLMProvider
@@ -227,7 +228,7 @@
 @patch('app.db.models.VectorMetadata')
 @patch('app.db.models.Document')
-@patch('app.core.vector_store.FaissVectorStore')
+@patch('app.core.vector_store.faiss_store.FaissVectorStore')
 def test_rag_service_add_document_success(mock_vector_store, mock_document_model, mock_vector_metadata_model):
     """
     Test the RAGService.add_document method for a successful run.
@@ -282,7 +283,7 @@
         embedding_model="mock_embedder" # This now passes because the mock embedder is of type MockEmbedder
     )

-@patch('app.core.vector_store.FaissVectorStore')
+@patch('app.core.vector_store.faiss_store.FaissVectorStore')
 def test_rag_service_add_document_error_handling(mock_vector_store):
     """
     Test the RAGService.add_document method's error handling.
diff --git a/ai-hub/tests/core/test_vector_store.py b/ai-hub/tests/core/test_vector_store.py
deleted file mode 100644
index 0af7e46..0000000
--- a/ai-hub/tests/core/test_vector_store.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import os
-import pytest
-import numpy as np
-import requests
-import json
-from unittest import mock
-from unittest.mock import MagicMock
-
-from app.core.vector_store import FaissVectorStore, MockEmbedder, GenAIEmbedder, get_embedder_from_config
-from app.config import EmbeddingProvider
-
-# Define a constant for the dimension to ensure consistency
-TEST_DIMENSION = 768
-
-# --- Fixtures ---
-
-@pytest.fixture
-def temp_faiss_file(tmp_path):
-    """
-    Provides a temporary file path for the FAISS index to ensure tests are isolated.
-    """
-    test_dir = tmp_path / "faiss_test"
-    test_dir.mkdir()
-    return str(test_dir / "test_index.faiss")
-
-@pytest.fixture
-def mock_embedder():
-    """
-    Creates a MockEmbedder instance with the correct dimension.
-    """
-    return MockEmbedder(dimension=TEST_DIMENSION)
-
-@pytest.fixture
-def mock_genai_embedder():
-    """
-    Mocks the GenAIEmbedder to avoid making real API calls.
-    It patches the synchronous requests.post call and returns a mock response.
-    """
-    with mock.patch('requests.post') as mock_post:
-        # Configure the mock response object
-        mock_response = MagicMock()
-        mock_response.raise_for_status.return_value = None  # No exception on success
-
-        # Define the JSON content that the mock response will return
-        embedding_data = np.random.rand(TEST_DIMENSION).tolist()
-        mock_response.json.return_value = {
-            "embedding": {"values": embedding_data}
-        }
-        mock_post.return_value = mock_response
-
-        # Create an instance of the real GenAIEmbedder class, now with the dimension argument
-        embedder = GenAIEmbedder(
-            model_name="gemini-embedding-001",
-            api_key="mock_api_key_for_testing",
-            dimension=TEST_DIMENSION  # FIX: Added the missing dimension argument
-        )
-        yield embedder
-
-@pytest.fixture(params=[
-    pytest.param('mock_embedder', id="MockEmbedder"),
-    pytest.param('mock_genai_embedder', id="GenAIEmbedder")
-])
-def faiss_store(request, temp_faiss_file):
-    """
-    Parametrized fixture to test FaissVectorStore with both embedders.
-    """
-    embedder = request.getfixturevalue(request.param)
-    faiss_store_instance = FaissVectorStore(
-        index_file_path=temp_faiss_file,
-        dimension=TEST_DIMENSION,
-        embedder=embedder,
-    )
-    yield faiss_store_instance
-
-# --- Test Cases ---
-
-def test_add_document(faiss_store: FaissVectorStore):
-    """
-    Test the add_document method to ensure it adds a vector and saves the index.
-    """
-    test_text = "This is a test document."
-
-    # Assert that the index is initially empty
-    assert faiss_store.index.ntotal == 0
-
-    # Add a document and check the index size
-    faiss_id = faiss_store.add_document(test_text)
-
-    assert faiss_store.index.ntotal == 1
-    assert faiss_id == 0
-    assert os.path.exists(faiss_store.index_file_path)
-
-def test_add_multiple_documents(faiss_store: FaissVectorStore):
-    """
-    Test that multiple documents can be added and the index size grows correctly.
-    """
-    docs = ["Doc 1", "Doc 2", "Doc 3"]
-
-    assert faiss_store.index.ntotal == 0
-
-    faiss_ids = faiss_store.add_multiple_documents(docs)
-
-    assert faiss_store.index.ntotal == 3
-    assert len(faiss_ids) == 3
-    assert faiss_ids == [0, 1, 2]
-
-def test_load_existing_index(temp_faiss_file, mock_embedder):
-    """
-    Test that the store can load an existing index file from disk.
-    """
-    # 1. Create a store and add a document to it
-    first_store = FaissVectorStore(
-        index_file_path=temp_faiss_file,
-        dimension=TEST_DIMENSION,
-        embedder=mock_embedder,
-    )
-    first_store.add_document("Document for persistence test.")
-
-    # 2. Create a new store instance with the same file path
-    #    This should load the existing index, not create a new one
-    second_store = FaissVectorStore(
-        index_file_path=temp_faiss_file,
-        dimension=TEST_DIMENSION,
-        embedder=mock_embedder,
-    )
-
-    # 3. Assert that the second store has the data from the first
-    assert second_store.index.ntotal == 1
-    assert second_store.doc_id_map == [0]
-
-def test_search_similar_documents(faiss_store: FaissVectorStore):
-    """
-    Test search functionality with a mock and a real embedder,
-    verifying the format of the results.
-    """
-    # Add documents to the store
-    faiss_store.add_document("The sun is a star.")
-    faiss_store.add_document("Mars is a planet.")
-    faiss_store.add_document("The moon orbits the Earth.")
-
-    # Since our embeddings are random (for the mock) or not guaranteed to be close,
-    # we just check that the search returns the correct number of results.
-    query_text = "What is a star?"
-    k = 2
-
-    search_results = faiss_store.search_similar_documents(query_text, k=k)
-
-    assert len(search_results) == k
-    assert isinstance(search_results[0], int)
diff --git a/ai-hub/tests/core/vector_store/__init__.py b/ai-hub/tests/core/vector_store/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/ai-hub/tests/core/vector_store/__init__.py
diff --git a/ai-hub/tests/core/vector_store/conftest.py b/ai-hub/tests/core/vector_store/conftest.py
new file mode 100644
index 0000000..a3fe41b
--- /dev/null
+++ b/ai-hub/tests/core/vector_store/conftest.py
@@ -0,0 +1,39 @@
+import pytest
+import numpy as np
+from unittest import mock
+from unittest.mock import MagicMock
+from app.core.vector_store.faiss_store import FaissVectorStore
+from app.core.vector_store.embedder.mock import MockEmbedder
+from app.core.vector_store.embedder.genai import GenAIEmbedder
+
+TEST_DIMENSION = 768
+
+@pytest.fixture
+def temp_faiss_file(tmp_path):
+    test_dir = tmp_path / "faiss_test"
+    test_dir.mkdir()
+    return str(test_dir / "test_index.faiss")
+
+@pytest.fixture
+def mock_embedder():
+    return MockEmbedder(dimension=TEST_DIMENSION)
+
+@pytest.fixture
+def mock_genai_embedder():
+    with mock.patch('requests.post') as mock_post:
+        embedding_data = np.random.rand(TEST_DIMENSION).tolist()
+        mock_response = MagicMock()
+        mock_response.raise_for_status.return_value = None
+        mock_response.json.return_value = {"embedding": {"values": embedding_data}}
+        mock_post.return_value = mock_response
+
+        yield GenAIEmbedder("gemini-embedding-001", "mock_api_key", TEST_DIMENSION)
+
+@pytest.fixture(params=[
+    pytest.param('mock_embedder', id="MockEmbedder"),
+    pytest.param('mock_genai_embedder', id="GenAIEmbedder")
+])
+def faiss_store(request, temp_faiss_file):
+    embedder = request.getfixturevalue(request.param)
+    store = FaissVectorStore(temp_faiss_file, TEST_DIMENSION, embedder)
+    yield store
diff --git a/ai-hub/tests/core/vector_store/test_embedder_factory.py b/ai-hub/tests/core/vector_store/test_embedder_factory.py
new file mode 100644
index 0000000..7413376
--- /dev/null
+++ b/ai-hub/tests/core/vector_store/test_embedder_factory.py
@@ -0,0 +1,20 @@
+import pytest
+from app.core.vector_store.embedder.factory import get_embedder_from_config
+from app.config import EmbeddingProvider
+
+def test_get_mock_embedder():
+    embedder = get_embedder_from_config(EmbeddingProvider.MOCK, 768, None, None)
+    assert embedder is not None
+    assert embedder.dimension == 768
+
+def test_get_genai_embedder():
+    embedder = get_embedder_from_config(
+        EmbeddingProvider.GOOGLE_GENAI, 768, "gemini-embedding-001", "fake_key"
+    )
+    assert embedder.model_name == "gemini-embedding-001"
+    assert embedder.api_key == "fake_key"
+    assert embedder.dimension == 768
+
+def test_get_embedder_raises_for_invalid_provider():
+    with pytest.raises(ValueError):
+        get_embedder_from_config("invalid", 768, None, None)
diff --git a/ai-hub/tests/core/vector_store/test_faiss_store.py b/ai-hub/tests/core/vector_store/test_faiss_store.py
new file mode 100644
index 0000000..73c440e
--- /dev/null
+++ b/ai-hub/tests/core/vector_store/test_faiss_store.py
@@ -0,0 +1,35 @@
+import os
+import pytest
+from app.core.vector_store.faiss_store import FaissVectorStore
+
+def test_add_document(faiss_store: FaissVectorStore):
+    test_text = "This is a test document."
+    assert faiss_store.index.ntotal == 0
+    faiss_id = faiss_store.add_document(test_text)
+    assert faiss_store.index.ntotal == 1
+    assert faiss_id == 0
+    assert os.path.exists(faiss_store.index_file_path)
+
+def test_add_multiple_documents(faiss_store: FaissVectorStore):
+    docs = ["Doc 1", "Doc 2", "Doc 3"]
+    assert faiss_store.index.ntotal == 0
+    faiss_ids = faiss_store.add_multiple_documents(docs)
+    assert faiss_store.index.ntotal == 3
+    assert faiss_ids == [0, 1, 2]
+
+def test_load_existing_index(temp_faiss_file, mock_embedder):
+    store1 = FaissVectorStore(temp_faiss_file, 768, mock_embedder)
+    store1.add_document("Persistence test.")
+
+    store2 = FaissVectorStore(temp_faiss_file, 768, mock_embedder)
+    assert store2.index.ntotal == 1
+    assert store2.doc_id_map == [0]
+
+def test_search_similar_documents(faiss_store: FaissVectorStore):
+    faiss_store.add_document("The sun is a star.")
+    faiss_store.add_document("Mars is a planet.")
+    faiss_store.add_document("The moon orbits the Earth.")
+
+    results = faiss_store.search_similar_documents("What is a star?", k=2)
+    assert len(results) == 2
+    assert isinstance(results[0], int)
diff --git a/ai-hub/tests/core/vector_store/test_mock_embedder.py b/ai-hub/tests/core/vector_store/test_mock_embedder.py
new file mode 100644
index 0000000..828b652
--- /dev/null
+++ b/ai-hub/tests/core/vector_store/test_mock_embedder.py
@@ -0,0 +1,18 @@
+import numpy as np
+from unittest import mock
+from unittest.mock import MagicMock
+from app.core.vector_store.embedder.genai import GenAIEmbedder
+
+def test_genai_embedder_returns_correct_shape():
+    with mock.patch('requests.post') as mock_post:
+        embedding_data = np.random.rand(768).tolist()
+        mock_response = MagicMock()
+        mock_response.raise_for_status.return_value = None
+        mock_response.json.return_value = {"embedding": {"values": embedding_data}}
+        mock_post.return_value = mock_response
+
+        embedder = GenAIEmbedder("gemini-embedding-001", "fake_key", 768)
+        result = embedder.embed_text("Hello world")
+
+        assert result.shape == (1, 768)
+        assert result.dtype == np.float32
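[Editor's note] Taken together, the factory and the store compose as below. This is a minimal wiring sketch, not code from the diff: the constructor and method signatures come from factory.py and faiss_store.py above, while the index path and the 768 dimension are illustrative values.

    from app.config import EmbeddingProvider
    from app.core.vector_store.embedder.factory import get_embedder_from_config
    from app.core.vector_store.faiss_store import FaissVectorStore

    # Build an embedder from config; the MOCK provider needs no model name or API key.
    embedder = get_embedder_from_config(EmbeddingProvider.MOCK, 768, None, None)

    # The store creates a new IndexFlatL2 when the file does not exist yet,
    # and reloads the persisted index on subsequent runs.
    store = FaissVectorStore("local_faiss.index", 768, embedder)

    doc_id = store.add_document("The sun is a star.")  # also persists the index to disk
    hits = store.search_similar_documents("What is a star?", k=1)  # -> [doc_id]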