Arnav Agrawal 821e9d7e20
Add support for ColPali (#43)
* debug mps not supported

* further debug (i think i lost some braincells)

* fix mps bug and resolve dependency issues

* remove libmagic dependence

* add colpali embedding model

* multi-vector store works - verified with testing

* add integration testing

* support text embedding in colpali

* complete colpali integration and testing

* formatting + some PR comments

* remove experimental files

* resolve PR comments
2025-02-26 20:17:12 -05:00

41 lines
1.1 KiB
Python

from typing import Any, Dict, List
from pydantic import BaseModel, Field
import numpy as np
# An embedding is a flat list of floats, a list of such lists, or a NumPy array.
Embedding = List[float] | List[List[float]] | np.ndarray
class DocumentChunk(BaseModel):
    """Represents a chunk stored in VectorStore"""

    # Embedding may be an np.ndarray, which pydantic has no built-in schema
    # for, so arbitrary (isinstance-checked) types are permitted.
    model_config = {"arbitrary_types_allowed": True}

    # external_id of parent document
    document_id: str
    # the chunk's content
    content: str
    # flat vector, list of vectors, or NumPy array (see Embedding)
    embedding: Embedding
    # presumably the chunk's index within the parent document -- TODO confirm
    chunk_number: int
    # chunk-specific metadata; a fresh dict is created when omitted
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # 0.0 when not supplied
    # NOTE(review): looks like a retrieval/similarity score -- confirm with callers
    score: float = Field(default=0.0)
class Chunk(BaseModel):
    """Represents a chunk containing content and metadata"""

    model_config = {"arbitrary_types_allowed": True}

    # the chunk's content
    content: str
    # free-form metadata; a fresh dict is created when omitted
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def to_document_chunk(
        self, document_id: str, chunk_number: int, embedding: Embedding
    ) -> DocumentChunk:
        """Build a DocumentChunk from this chunk and the supplied values.

        This chunk's ``content`` and ``metadata`` are passed through as-is;
        ``document_id``, ``chunk_number`` and ``embedding`` come from the
        arguments. ``score`` is not set here, so the DocumentChunk default
        applies.
        """
        fields = {
            "document_id": document_id,
            "chunk_number": chunk_number,
            "content": self.content,
            "metadata": self.metadata,
            "embedding": embedding,
        }
        return DocumentChunk(**fields)