mirror of https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00

Add batch ingestion (#55)

This commit is contained in:
parent 5d76521059
commit adc0b2dbb8
core/api.py (117 changes)
@@ -1,16 +1,17 @@
+import asyncio
 import json
 from datetime import datetime, UTC, timedelta
 from pathlib import Path
 import sys
 from typing import Any, Dict, List, Optional
-from fastapi import FastAPI, Form, HTTPException, Depends, Header, UploadFile
+from fastapi import FastAPI, Form, HTTPException, Depends, Header, UploadFile, File
 from fastapi.middleware.cors import CORSMiddleware
 import jwt
 import logging
 from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
 from core.completion.openai_completion import OpenAICompletionModel
 from core.embedding.ollama_embedding_model import OllamaEmbeddingModel
-from core.models.request import RetrieveRequest, CompletionQueryRequest, IngestTextRequest, CreateGraphRequest
+from core.models.request import RetrieveRequest, CompletionQueryRequest, IngestTextRequest, CreateGraphRequest, BatchIngestResponse
 from core.models.completion import ChunkSource, CompletionResponse
 from core.models.documents import Document, DocumentResult, ChunkResult
 from core.models.graph import Graph
@@ -360,6 +361,118 @@ async def ingest_file(
         raise HTTPException(status_code=403, detail=str(e))


+@app.post("/ingest/files", response_model=BatchIngestResponse)
+async def batch_ingest_files(
+    files: List[UploadFile] = File(...),
+    metadata: str = Form("{}"),
+    rules: str = Form("[]"),
+    use_colpali: Optional[bool] = Form(None),
+    parallel: bool = Form(True),
+    auth: AuthContext = Depends(verify_token),
+) -> BatchIngestResponse:
+    """
+    Batch ingest multiple files.
+
+    Args:
+        files: List of files to ingest
+        metadata: JSON string of metadata (either a single dict or list of dicts)
+        rules: JSON string of rules list. Can be either:
+               - A single list of rules to apply to all files
+               - A list of rule lists, one per file
+        use_colpali: Whether to use ColPali-style embedding
+        parallel: Whether to process files in parallel
+        auth: Authentication context
+
+    Returns:
+        BatchIngestResponse containing:
+        - documents: List of successfully ingested documents
+        - errors: List of errors encountered during ingestion
+    """
+    if not files:
+        raise HTTPException(
+            status_code=400,
+            detail="No files provided for batch ingestion"
+        )
+
+    try:
+        metadata_value = json.loads(metadata)
+        rules_list = json.loads(rules)
+    except json.JSONDecodeError as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON: {str(e)}")
+
+    # Validate metadata if it's a list
+    if isinstance(metadata_value, list) and len(metadata_value) != len(files):
+        raise HTTPException(
+            status_code=400,
+            detail=f"Number of metadata items ({len(metadata_value)}) must match number of files ({len(files)})"
+        )
+
+    # Validate rules if it's a list of lists
+    if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list):
+        if len(rules_list) != len(files):
+            raise HTTPException(
+                status_code=400,
+                detail=f"Number of rule lists ({len(rules_list)}) must match number of files ({len(files)})"
+            )
+
+    documents = []
+    errors = []
+
+    async with telemetry.track_operation(
+        operation_type="batch_ingest",
+        user_id=auth.entity_id,
+        metadata={
+            "file_count": len(files),
+            "metadata_type": "list" if isinstance(metadata_value, list) else "single",
+            "rules_type": "per_file" if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list) else "shared",
+        },
+    ):
+        if parallel:
+            tasks = []
+            for i, file in enumerate(files):
+                metadata_item = metadata_value[i] if isinstance(metadata_value, list) else metadata_value
+                file_rules = rules_list[i] if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list) else rules_list
+                task = document_service.ingest_file(
+                    file=file,
+                    metadata=metadata_item,
+                    auth=auth,
+                    rules=file_rules,
+                    use_colpali=use_colpali
+                )
+                tasks.append(task)
+
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            for i, result in enumerate(results):
+                if isinstance(result, Exception):
+                    errors.append({
+                        "filename": files[i].filename,
+                        "error": str(result)
+                    })
+                else:
+                    documents.append(result)
+        else:
+            for i, file in enumerate(files):
+                try:
+                    metadata_item = metadata_value[i] if isinstance(metadata_value, list) else metadata_value
+                    file_rules = rules_list[i] if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list) else rules_list
+                    doc = await document_service.ingest_file(
+                        file=file,
+                        metadata=metadata_item,
+                        auth=auth,
+                        rules=file_rules,
+                        use_colpali=use_colpali
+                    )
+                    documents.append(doc)
+                except Exception as e:
+                    errors.append({
+                        "filename": file.filename,
+                        "error": str(e)
+                    })
+
+    return BatchIngestResponse(documents=documents, errors=errors)
+
+
 @app.post("/retrieve/chunks", response_model=List[ChunkResult])
 async def retrieve_chunks(request: RetrieveRequest, auth: AuthContext = Depends(verify_token)):
     """Retrieve relevant chunks."""
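Usage sketch for the new endpoint: it takes multipart `files` plus JSON-encoded form fields, a minimal example assuming a server on localhost:8000 and a valid bearer token (both placeholders, not part of this commit):

```python
# Minimal client sketch for POST /ingest/files (server URL and token are placeholder assumptions).
import json
import httpx

files = [
    ("files", ("report1.pdf", open("report1.pdf", "rb"), "application/pdf")),
    ("files", ("notes.txt", open("notes.txt", "rb"), "text/plain")),
]
data = {
    "metadata": json.dumps({"category": "research"}),  # one dict shared by all files
    "rules": json.dumps([]),                           # or a list of rule lists, one per file
    "use_colpali": "true",
    "parallel": "true",
}
resp = httpx.post(
    "http://localhost:8000/ingest/files",
    files=files,
    data=data,
    headers={"Authorization": "Bearer <token>"},
    timeout=120.0,
)
result = resp.json()
print(f"{len(result['documents'])} ingested, {len(result['errors'])} errors")
```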
@@ -1,6 +1,8 @@
-from typing import Dict, Any, Optional, List
+from typing import Dict, List, Optional, Any
 from pydantic import BaseModel, Field

+from core.models.documents import Document
+

 class RetrieveRequest(BaseModel):
     """Base retrieve request model"""
@@ -49,3 +51,9 @@ class CreateGraphRequest(BaseModel):
     documents: Optional[List[str]] = Field(
         None, description="Optional list of specific document IDs to include"
     )
+
+
+class BatchIngestResponse(BaseModel):
+    """Response model for batch ingestion"""
+    documents: List[Document]
+    errors: List[Dict[str, str]]
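A minimal round-trip of the new response model, using only what this diff defines plus pydantic's standard `model_dump` serializer (which shell.py below also relies on):

```python
# Sketch: constructing and serializing the new BatchIngestResponse model.
from core.models.request import BatchIngestResponse

resp = BatchIngestResponse(
    documents=[],  # would hold core.models.documents.Document entries on success
    errors=[{"filename": "broken.bin", "error": "unsupported file type"}],
)
print(resp.model_dump())
# {'documents': [], 'errors': [{'filename': 'broken.bin', 'error': 'unsupported file type'}]}
```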
@@ -405,6 +405,24 @@ class DocumentService:
         # Read file content
         file_content = await file.read()
         file_type = filetype.guess(file_content)

+        # Set default mime type for cases where filetype.guess returns None
+        mime_type = ""
+        if file_type is not None:
+            mime_type = file_type.mime
+        elif file.filename:
+            # Try to determine by file extension as fallback
+            import mimetypes
+            guessed_type = mimetypes.guess_type(file.filename)[0]
+            if guessed_type:
+                mime_type = guessed_type
+            else:
+                # Default for text files
+                mime_type = "text/plain"
+        else:
+            mime_type = "application/octet-stream"  # Generic binary data
+
+        logger.info(f"Determined MIME type: {mime_type} for file {file.filename}")
+
         # Parse file to text first
         additional_metadata, text = await self.parser.parse_file_to_text(
@@ -423,7 +441,7 @@ class DocumentService:

         # Create document record
         doc = Document(
-            content_type=file_type.mime or "",
+            content_type=mime_type,
             filename=file.filename,
             metadata=metadata,
             owner={"type": auth.entity_type, "id": auth.entity_id},
@@ -495,8 +513,19 @@ class DocumentService:
     def _create_chunks_multivector(
         self, file_type, file_content_base64: str, file_content: bytes, chunks: List[Chunk]
     ):
-        logger.info(f"Creating chunks for multivector embedding for file type {file_type.mime}")
-        match file_type.mime:
+        # Handle the case where file_type is None
+        mime_type = file_type.mime if file_type is not None else "text/plain"
+        logger.info(f"Creating chunks for multivector embedding for file type {mime_type}")
+
+        # If file_type is None, treat it as a text file
+        if file_type is None:
+            logger.info("File type is None, treating as text")
+            return [
+                Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
+                for chunk in chunks
+            ]
+
+        match mime_type:
             case file_type if file_type in IMAGE:
                 return [Chunk(content=file_content_base64, metadata={"is_image": True})]
             case "application/pdf":
@@ -1017,6 +1046,15 @@ class DocumentService:
         file_type = filetype.guess(file_content)
         if file_type:
             doc.content_type = file_type.mime
+        else:
+            # If filetype.guess failed, try to determine from filename
+            import mimetypes
+            guessed_type = mimetypes.guess_type(file.filename)[0]
+            if guessed_type:
+                doc.content_type = guessed_type
+            else:
+                # Default fallback
+                doc.content_type = "text/plain" if file.filename.endswith('.txt') else "application/octet-stream"

         # Update filename
         doc.filename = file.filename
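The filename fallback above leans on the standard library's `mimetypes` module; for reference, its behavior looks like this (results can vary slightly with the platform's MIME tables):

```python
# Stdlib behavior behind the extension-based fallback.
import mimetypes

print(mimetypes.guess_type("notes.txt")[0])    # 'text/plain'
print(mimetypes.guess_type("paper.pdf")[0])    # 'application/pdf'
print(mimetypes.guess_type("mystery.xyz")[0])  # None -> the code above falls back to a default type
```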
@@ -8,7 +8,7 @@ from typing import AsyncGenerator, Dict
 from httpx import AsyncClient
 from fastapi import FastAPI
 from httpx import ASGITransport
-from core.api import app, get_settings
+from core.api import get_settings
 import filetype
 import logging
 from sqlalchemy.ext.asyncio import create_async_engine
@@ -1426,6 +1426,303 @@ async def test_query_with_graph(client: AsyncClient):
     assert response_no_graph.status_code == 200


+@pytest.mark.asyncio
+async def test_batch_ingest_with_shared_metadata(
+    client: AsyncClient
+):
+    """Test batch ingestion with shared metadata for all files."""
+    headers = create_auth_header()
+    # Create test files
+    files = [
+        ("files", ("test1.txt", b"Test content 1")),
+        ("files", ("test2.txt", b"Test content 2")),
+    ]
+
+    # Shared metadata for all files
+    metadata = {"category": "test", "batch": "shared"}
+
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps(metadata),
+            "rules": json.dumps([]),
+            "use_colpali": "true",
+            "parallel": "true",
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 200
+    result = response.json()
+    assert len(result["documents"]) == 2
+    assert len(result["errors"]) == 0
+
+    # Verify all documents got the same metadata
+    for doc in result["documents"]:
+        assert doc["metadata"]["category"] == "test"
+        assert doc["metadata"]["batch"] == "shared"
+
+
+@pytest.mark.asyncio
+async def test_batch_ingest_with_individual_metadata(
+    client: AsyncClient
+):
+    """Test batch ingestion with individual metadata per file."""
+    headers = create_auth_header()
+    # Create test files
+    files = [
+        ("files", ("test1.txt", b"Test content 1")),
+        ("files", ("test2.txt", b"Test content 2")),
+    ]
+
+    # Individual metadata
+    metadata = [
+        {"category": "test1", "batch": "individual"},
+        {"category": "test2", "batch": "individual"},
+    ]
+
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps(metadata),
+            "rules": json.dumps([]),
+            "use_colpali": "true",
+            "parallel": "true",
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 200
+    result = response.json()
+    assert len(result["documents"]) == 2
+    assert len(result["errors"]) == 0
+
+    # Verify each document got its correct metadata
+    assert result["documents"][0]["metadata"]["category"] == "test1"
+    assert result["documents"][1]["metadata"]["category"] == "test2"
+
+
+@pytest.mark.asyncio
+async def test_batch_ingest_metadata_validation(
+    client: AsyncClient
+):
+    """Test validation when metadata list length doesn't match files."""
+    headers = create_auth_header()
+    files = [
+        ("files", ("test1.txt", b"Test content 1")),
+        ("files", ("test2.txt", b"Test content 2")),
+    ]
+
+    # Metadata list with wrong length
+    metadata = [
+        {"category": "test1"},
+        {"category": "test2"},
+        {"category": "test3"},  # Extra metadata
+    ]
+
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps(metadata),
+            "rules": json.dumps([]),
+            "use_colpali": "true",
+            "parallel": "true",
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 400
+    assert "must match number of files" in response.json()["detail"]
+
+
+@pytest.mark.asyncio
+async def test_batch_ingest_sequential(
+    client: AsyncClient
+):
+    """Test sequential batch ingestion."""
+    headers = create_auth_header()
+    files = [
+        ("files", ("test1.txt", b"Test content 1")),
+        ("files", ("test2.txt", b"Test content 2")),
+    ]
+
+    metadata = {"category": "test"}
+
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps(metadata),
+            "rules": json.dumps([]),
+            "use_colpali": "true",
+            "parallel": "false",  # Process sequentially
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 200
+    result = response.json()
+    assert len(result["documents"]) == 2
+    assert len(result["errors"]) == 0
+
+
+@pytest.mark.asyncio
+async def test_batch_ingest_with_rules(
+    client: AsyncClient
+):
+    """Test batch ingestion with rules applied."""
+    headers = create_auth_header()
+    files = [
+        ("files", ("test1.txt", b"Test content 1")),
+        ("files", ("test2.txt", b"Test content 2")),
+    ]
+
+    # Test shared rules for all files
+    shared_rules = [{"type": "natural_language", "prompt": "Extract keywords"}]
+
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps({}),
+            "rules": json.dumps(shared_rules),
+            "use_colpali": "true",
+            "parallel": "true",
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 200
+    result = response.json()
+    assert len(result["documents"]) == 2
+    assert len(result["errors"]) == 0
+
+    # Test per-file rules
+    per_file_rules = [
+        [{"type": "natural_language", "prompt": "Extract keywords"}],  # Rules for first file
+        [{"type": "metadata_extraction", "schema": {"title": "string"}}],  # Rules for second file
+    ]
+
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps({}),
+            "rules": json.dumps(per_file_rules),
+            "use_colpali": "true",
+            "parallel": "true",
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 200
+    result = response.json()
+    assert len(result["documents"]) == 2
+    assert len(result["errors"]) == 0
+
+
+@pytest.mark.asyncio
+async def test_batch_ingest_rules_validation(
+    client: AsyncClient
+):
+    """Test validation of rules format and length."""
+    headers = create_auth_header()
+    files = [
+        ("files", ("test1.txt", b"Test content 1")),
+        ("files", ("test2.txt", b"Test content 2")),
+    ]
+
+    # Test invalid rules format
+    invalid_rules = "not a list"
+
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps({}),
+            "rules": invalid_rules,
+            "use_colpali": "true",
+            "parallel": "true",
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 400
+    assert "Invalid JSON" in response.json()["detail"]
+
+    # Test per-file rules with wrong length
+    per_file_rules = [
+        [{"type": "natural_language", "prompt": "Extract keywords"}],  # Only one set of rules
+    ]
+
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps({}),
+            "rules": json.dumps(per_file_rules),
+            "use_colpali": "true",
+            "parallel": "true",
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 400
+    assert "must match number of files" in response.json()["detail"]
+
+
+@pytest.mark.asyncio
+async def test_batch_ingest_sequential_vs_parallel(
+    client: AsyncClient
+):
+    """Test both sequential and parallel batch ingestion."""
+    headers = create_auth_header()
+    files = [
+        ("files", ("test1.txt", b"Test content 1")),
+        ("files", ("test2.txt", b"Test content 2")),
+        ("files", ("test3.txt", b"Test content 3")),
+    ]
+
+    # Test parallel processing
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps({}),
+            "rules": json.dumps([]),
+            "use_colpali": "true",
+            "parallel": "true",
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 200
+    result = response.json()
+    assert len(result["documents"]) == 3
+    assert len(result["errors"]) == 0
+
+    # Test sequential processing
+    response = await client.post(
+        "/ingest/files",
+        files=files,
+        data={
+            "metadata": json.dumps({}),
+            "rules": json.dumps([]),
+            "use_colpali": "true",
+            "parallel": "false",
+        },
+        headers=headers,
+    )
+
+    assert response.status_code == 200
+    result = response.json()
+    assert len(result["documents"]) == 3
+    assert len(result["errors"]) == 0
+
+
 @pytest.mark.asyncio
 async def test_cross_document_query_with_graph(client: AsyncClient):
     """Test cross-document information retrieval using knowledge graph."""
@@ -1,5 +1,6 @@
-from io import BytesIO
+from io import BytesIO, IOBase
 import json
+import logging
 from pathlib import Path
 from typing import Dict, Any, List, Optional, Union, BinaryIO
 from urllib.parse import urlparse
@@ -20,6 +21,8 @@ from .models import (
 )
 from .rules import Rule

+logger = logging.getLogger(__name__)
+
 # Type alias for rules
 RuleOrDict = Union[Rule, Dict[str, Any]]

@@ -222,41 +225,7 @@ class AsyncDataBridge:
         rules: Optional[List[RuleOrDict]] = None,
         use_colpali: bool = True,
     ) -> Document:
-        """
-        Ingest a file document into DataBridge.
-
-        Args:
-            file: File to ingest (path string, bytes, file object, or Path)
-            filename: Name of the file
-            metadata: Optional metadata dictionary
-            rules: Optional list of rules to apply during ingestion. Can be:
-                - MetadataExtractionRule: Extract metadata using a schema
-                - NaturalLanguageRule: Transform content using natural language
-            use_colpali: Whether to use ColPali-style embedding model to ingest the file (slower, but significantly better retrieval accuracy for text and images)
-        Returns:
-            Document: Metadata of the ingested document
-
-        Example:
-            ```python
-            from databridge.rules import MetadataExtractionRule, NaturalLanguageRule
-            from pydantic import BaseModel
-
-            class DocumentInfo(BaseModel):
-                title: str
-                author: str
-                department: str
-
-            doc = await db.ingest_file(
-                "document.pdf",
-                filename="document.pdf",
-                metadata={"category": "research"},
-                rules=[
-                    MetadataExtractionRule(schema=DocumentInfo),
-                    NaturalLanguageRule(prompt="Extract key points only")
-                ]
-            )
-            ```
-        """
+        """Ingest a file document into DataBridge."""
         # Handle different file input types
         if isinstance(file, (str, Path)):
             file_path = Path(file)
@@ -290,6 +259,131 @@ class AsyncDataBridge:
             if isinstance(file, (str, Path)):
                 file_obj.close()

+    async def ingest_files(
+        self,
+        files: List[Union[str, bytes, BinaryIO, Path]],
+        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+    ) -> List[Document]:
+        """
+        Ingest multiple files into DataBridge.
+
+        Args:
+            files: List of files to ingest (path strings, bytes, file objects, or Paths)
+            metadata: Optional metadata (single dict for all files or list of dicts)
+            rules: Optional list of rules to apply
+            use_colpali: Whether to use ColPali-style embedding
+            parallel: Whether to process files in parallel
+
+        Returns:
+            List[Document]: List of successfully ingested documents
+
+        Raises:
+            ValueError: If metadata list length doesn't match files length
+        """
+        # Convert files to format expected by API
+        file_objects = []
+        for file in files:
+            if isinstance(file, (str, Path)):
+                path = Path(file)
+                file_objects.append(("files", (path.name, open(path, "rb"))))
+            elif isinstance(file, bytes):
+                file_objects.append(("files", ("file.bin", file)))
+            else:
+                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
+
+        try:
+            # Prepare request data
+            # Convert rules appropriately based on whether it's a flat list or list of lists
+            if rules:
+                if all(isinstance(r, list) for r in rules):
+                    # List of lists - per-file rules
+                    converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
+                else:
+                    # Flat list - shared rules for all files
+                    converted_rules = [self._convert_rule(r) for r in rules]
+            else:
+                converted_rules = []
+
+            data = {
+                "metadata": json.dumps(metadata or {}),
+                "rules": json.dumps(converted_rules),
+                "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
+                "parallel": str(parallel).lower(),
+            }
+
+            response = await self._request("POST", "ingest/files", data=data, files=file_objects)
+
+            if response.get("errors"):
+                # Log errors but don't raise exception
+                for error in response["errors"]:
+                    logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
+
+            docs = [Document(**doc) for doc in response["documents"]]
+            for doc in docs:
+                doc._client = self
+            return docs
+        finally:
+            # Clean up file objects
+            for _, (_, file_obj) in file_objects:
+                if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
+                    file_obj.close()
+
+    async def ingest_directory(
+        self,
+        directory: Union[str, Path],
+        recursive: bool = False,
+        pattern: str = "*",
+        metadata: Optional[Dict[str, Any]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+    ) -> List[Document]:
+        """
+        Ingest all files in a directory into DataBridge.
+
+        Args:
+            directory: Path to directory containing files to ingest
+            recursive: Whether to recursively process subdirectories
+            pattern: Optional glob pattern to filter files (e.g. "*.pdf")
+            metadata: Optional metadata dictionary to apply to all files
+            rules: Optional list of rules to apply
+            use_colpali: Whether to use ColPali-style embedding
+            parallel: Whether to process files in parallel
+
+        Returns:
+            List[Document]: List of ingested documents
+
+        Raises:
+            ValueError: If directory not found
+        """
+        directory = Path(directory)
+        if not directory.is_dir():
+            raise ValueError(f"Directory not found: {directory}")
+
+        # Collect all files matching pattern
+        if recursive:
+            files = list(directory.rglob(pattern))
+        else:
+            files = list(directory.glob(pattern))
+
+        # Filter out directories
+        files = [f for f in files if f.is_file()]
+
+        if not files:
+            return []
+
+        # Use ingest_files with collected paths
+        return await self.ingest_files(
+            files=files,
+            metadata=metadata,
+            rules=rules,
+            use_colpali=use_colpali,
+            parallel=parallel
+        )
+
     async def retrieve_chunks(
         self,
         query: str,
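A usage sketch for the new async helpers; the import path and connection URI format below are assumptions, so adjust them to your installation:

```python
# Hypothetical async usage of ingest_files / ingest_directory (import path and URI are assumed).
import asyncio
from databridge import AsyncDataBridge  # assumed import location

async def main():
    db = AsyncDataBridge("databridge://owner:token@localhost:8000")  # assumed URI format
    docs = await db.ingest_files(
        ["doc1.pdf", "doc2.pdf"],
        metadata=[{"author": "Alice"}, {"author": "Bob"}],  # per-file metadata
        parallel=True,
    )
    docs += await db.ingest_directory("data/papers", recursive=True, pattern="*.pdf")
    print(f"ingested {len(docs)} documents")

asyncio.run(main())
```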
@@ -1,5 +1,4 @@
-from typing import Dict, Any, List, Literal, Optional, Union
-from io import BinaryIO
+from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
 from pathlib import Path
 from datetime import datetime
 from pydantic import BaseModel, Field, field_validator
|
|||||||
import base64
|
import base64
|
||||||
from io import BytesIO
|
from io import BytesIO, IOBase
|
||||||
import io
|
import io
|
||||||
from PIL.Image import Image as PILImage
|
from PIL.Image import Image as PILImage
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Any, List, Optional, Union, BinaryIO
|
from typing import Dict, Any, List, Optional, Union, BinaryIO
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@@ -23,6 +24,8 @@ from .models import (
 )
 from .rules import Rule

+logger = logging.getLogger(__name__)
+
 # Type alias for rules
 RuleOrDict = Union[Rule, Dict[str, Any]]

@@ -294,6 +297,131 @@ class DataBridge:
             if isinstance(file, (str, Path)):
                 file_obj.close()

+    def ingest_files(
+        self,
+        files: List[Union[str, bytes, BinaryIO, Path]],
+        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+    ) -> List[Document]:
+        """
+        Ingest multiple files into DataBridge.
+
+        Args:
+            files: List of files to ingest (path strings, bytes, file objects, or Paths)
+            metadata: Optional metadata (single dict for all files or list of dicts)
+            rules: Optional list of rules to apply
+            use_colpali: Whether to use ColPali-style embedding
+            parallel: Whether to process files in parallel
+
+        Returns:
+            List[Document]: List of successfully ingested documents
+
+        Raises:
+            ValueError: If metadata list length doesn't match files length
+        """
+        # Convert files to format expected by API
+        file_objects = []
+        for file in files:
+            if isinstance(file, (str, Path)):
+                path = Path(file)
+                file_objects.append(("files", (path.name, open(path, "rb"))))
+            elif isinstance(file, bytes):
+                file_objects.append(("files", ("file.bin", file)))
+            else:
+                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
+
+        try:
+            # Prepare request data
+            # Convert rules appropriately based on whether it's a flat list or list of lists
+            if rules:
+                if all(isinstance(r, list) for r in rules):
+                    # List of lists - per-file rules
+                    converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
+                else:
+                    # Flat list - shared rules for all files
+                    converted_rules = [self._convert_rule(r) for r in rules]
+            else:
+                converted_rules = []
+
+            data = {
+                "metadata": json.dumps(metadata or {}),
+                "rules": json.dumps(converted_rules),
+                "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
+                "parallel": str(parallel).lower(),
+            }
+
+            response = self._request("POST", "ingest/files", data=data, files=file_objects)
+
+            if response.get("errors"):
+                # Log errors but don't raise exception
+                for error in response["errors"]:
+                    logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
+
+            docs = [Document(**doc) for doc in response["documents"]]
+            for doc in docs:
+                doc._client = self
+            return docs
+        finally:
+            # Clean up file objects
+            for _, (_, file_obj) in file_objects:
+                if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
+                    file_obj.close()
+
+    def ingest_directory(
+        self,
+        directory: Union[str, Path],
+        recursive: bool = False,
+        pattern: str = "*",
+        metadata: Optional[Dict[str, Any]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+    ) -> List[Document]:
+        """
+        Ingest all files in a directory into DataBridge.
+
+        Args:
+            directory: Path to directory containing files to ingest
+            recursive: Whether to recursively process subdirectories
+            pattern: Optional glob pattern to filter files (e.g. "*.pdf")
+            metadata: Optional metadata dictionary to apply to all files
+            rules: Optional list of rules to apply
+            use_colpali: Whether to use ColPali-style embedding
+            parallel: Whether to process files in parallel
+
+        Returns:
+            List[Document]: List of ingested documents
+
+        Raises:
+            ValueError: If directory not found
+        """
+        directory = Path(directory)
+        if not directory.is_dir():
+            raise ValueError(f"Directory not found: {directory}")
+
+        # Collect all files matching pattern
+        if recursive:
+            files = list(directory.rglob(pattern))
+        else:
+            files = list(directory.glob(pattern))
+
+        # Filter out directories
+        files = [f for f in files if f.is_file()]
+
+        if not files:
+            return []
+
+        # Use ingest_files with collected paths
+        return self.ingest_files(
+            files=files,
+            metadata=metadata,
+            rules=rules,
+            use_colpali=use_colpali,
+            parallel=parallel
+        )
+
     def retrieve_chunks(
         self,
         query: str,
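The synchronous client mirrors the async API one-for-one; a corresponding sketch, with the same assumptions about import path and URI:

```python
# Hypothetical synchronous usage mirroring the async sketch above.
from databridge import DataBridge  # assumed import location

db = DataBridge("databridge://owner:token@localhost:8000")  # assumed URI format
docs = db.ingest_files(["doc1.pdf", "doc2.pdf"], metadata={"category": "research"})
docs += db.ingest_directory("data/reports", pattern="*.pdf", parallel=False)
print(f"ingested {len(docs)} documents")
```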
shell.py (114 changes)
@@ -137,6 +137,120 @@ class DB:
         )
         return doc if as_object else doc.model_dump()

+    def ingest_files(
+        self,
+        files: List[str],
+        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        rules: Optional[List[Dict[str, Any]]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+        as_objects: bool = False,
+    ) -> List[Union[dict, 'Document']]:
+        """
+        Batch ingest multiple files into DataBridge.
+
+        Args:
+            files: List of file paths to ingest
+            metadata: Optional metadata (single dict for all files or list of dicts)
+            rules: Optional list of rules. Can be either:
+                - A single list of rules to apply to all files
+                - A list of rule lists, one per file
+            use_colpali: Whether to use ColPali-style embedding model
+            parallel: Whether to process files in parallel
+            as_objects: If True, returns Document objects with update methods, otherwise returns dicts
+
+        Returns:
+            List of document metadata (dicts or Document objects)
+
+        Example:
+            ```python
+            # Ingest multiple files with shared metadata
+            docs = db.ingest_files(
+                ["doc1.pdf", "doc2.pdf"],
+                metadata={"category": "research"},
+                parallel=True
+            )
+
+            # Ingest files with individual metadata
+            docs = db.ingest_files(
+                ["doc1.pdf", "doc2.pdf"],
+                metadata=[
+                    {"category": "research", "author": "Alice"},
+                    {"category": "reports", "author": "Bob"}
+                ]
+            )
+            ```
+        """
+        # Convert file paths to Path objects
+        file_paths = [Path(f) for f in files]
+
+        # Ingest files using the client
+        docs = self._client.ingest_files(
+            files=file_paths,
+            metadata=metadata,
+            rules=rules,
+            use_colpali=use_colpali,
+            parallel=parallel,
+        )
+
+        return docs if as_objects else [doc.model_dump() for doc in docs]
+
+    def ingest_directory(
+        self,
+        directory: str,
+        recursive: bool = False,
+        pattern: str = "*",
+        metadata: Optional[Dict[str, Any]] = None,
+        rules: Optional[List[Dict[str, Any]]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+        as_objects: bool = False,
+    ) -> List[Union[dict, 'Document']]:
+        """
+        Ingest all files in a directory into DataBridge.
+
+        Args:
+            directory: Path to directory containing files to ingest
+            recursive: Whether to recursively process subdirectories
+            pattern: Optional glob pattern to filter files (e.g. "*.pdf")
+            metadata: Optional metadata dictionary to apply to all files
+            rules: Optional list of rules. Can be either:
+                - A single list of rules to apply to all files
+                - A list of rule lists, one per file
+            use_colpali: Whether to use ColPali-style embedding model
+            parallel: Whether to process files in parallel
+            as_objects: If True, returns Document objects with update methods, otherwise returns dicts
+
+        Returns:
+            List of document metadata (dicts or Document objects)
+
+        Example:
+            ```python
+            # Ingest all PDFs in a directory and its subdirectories
+            docs = db.ingest_directory(
+                "data/documents",
+                recursive=True,
+                metadata={"category": "research"},
+                pattern="*.pdf"
+            )
+            ```
+        """
+        # Convert directory to Path
+        dir_path = Path(directory)
+
+        # Ingest directory using the client
+        docs = self._client.ingest_directory(
+            directory=dir_path,
+            recursive=recursive,
+            pattern=pattern,
+            metadata=metadata,
+            rules=rules,
+            use_colpali=use_colpali,
+            parallel=parallel,
+        )
+
+        return docs if as_objects else [doc.model_dump() for doc in docs]
+
     def retrieve_chunks(
         self,
         query: str,