Add delete document endpoint (#62)

This commit is contained in:
Adityavardhan Agrawal 2025-03-29 18:42:52 -07:00 committed by GitHub
parent 6a7f2586a0
commit 9ce0507616
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 434 additions and 2 deletions

View File

@ -640,6 +640,37 @@ async def get_document(document_id: str, auth: AuthContext = Depends(verify_toke
raise e
@app.delete("/documents/{document_id}")
async def delete_document(document_id: str, auth: AuthContext = Depends(verify_token)):
"""
Delete a document and all associated data.
This endpoint deletes a document and all its associated data, including:
- Document metadata
- Document content in storage
- Document chunks and embeddings in vector store
Args:
document_id: ID of the document to delete
auth: Authentication context (must have write access to the document)
Returns:
Deletion status
"""
try:
async with telemetry.track_operation(
operation_type="delete_document",
user_id=auth.entity_id,
metadata={"document_id": document_id},
):
success = await document_service.delete_document(document_id, auth)
if not success:
raise HTTPException(status_code=404, detail="Document not found or delete failed")
return {"status": "success", "message": f"Document {document_id} deleted successfully"}
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.get("/documents/filename/{filename}", response_model=Document)
async def get_document_by_filename(filename: str, auth: AuthContext = Depends(verify_token)):
"""Get document by filename."""

View File

@ -399,9 +399,9 @@ class PostgresDatabase(BaseDatabase):
return False
async def delete_document(self, document_id: str, auth: AuthContext) -> bool:
"""Delete document if user has admin access."""
"""Delete document if user has write access."""
try:
if not await self.check_access(document_id, auth, "admin"):
if not await self.check_access(document_id, auth, "write"):
return False
async with self.async_session() as session:

View File

@ -1307,6 +1307,95 @@ class DocumentService:
documents=documents,
)
async def delete_document(self, document_id: str, auth: AuthContext) -> bool:
"""
Delete a document and all its associated data.
This method:
1. Checks if the user has write access to the document
2. Gets the document to retrieve its chunk IDs
3. Deletes the document from the database
4. Deletes all associated chunks from the vector store (if possible)
5. Deletes the original file from storage if present
Args:
document_id: ID of the document to delete
auth: Authentication context
Returns:
bool: True if deletion was successful, False otherwise
Raises:
PermissionError: If the user doesn't have write access
"""
# First get the document to retrieve its chunk IDs
document = await self.db.get_document(document_id, auth)
if not document:
logger.error(f"Document {document_id} not found")
return False
# Verify write access - the database layer also checks this, but we check here too
# to avoid unnecessary operations if the user doesn't have permission
if not await self.db.check_access(document_id, auth, "write"):
logger.error(f"User {auth.entity_id} doesn't have write access to document {document_id}")
raise PermissionError(f"User doesn't have write access to document {document_id}")
# Delete document from database
db_success = await self.db.delete_document(document_id, auth)
if not db_success:
logger.error(f"Failed to delete document {document_id} from database")
return False
logger.info(f"Deleted document {document_id} from database")
# Try to delete chunks from vector store if they exist
if hasattr(document, 'chunk_ids') and document.chunk_ids:
try:
# Try to delete chunks by document ID
# Note: Some vector stores may not implement this method
if hasattr(self.vector_store, 'delete_chunks_by_document_id'):
await self.vector_store.delete_chunks_by_document_id(document_id)
logger.info(f"Deleted chunks for document {document_id} from vector store")
else:
logger.warning(f"Vector store does not support deleting chunks by document ID")
# Try to delete from colpali vector store as well
if self.colpali_vector_store and hasattr(self.colpali_vector_store, 'delete_chunks_by_document_id'):
await self.colpali_vector_store.delete_chunks_by_document_id(document_id)
logger.info(f"Deleted chunks for document {document_id} from colpali vector store")
except Exception as e:
logger.error(f"Error deleting chunks for document {document_id}: {e}")
# We continue even if chunk deletion fails - don't block document deletion
# Delete file from storage if it exists
if hasattr(document, 'storage_info') and document.storage_info:
try:
bucket = document.storage_info.get("bucket")
key = document.storage_info.get("key")
if bucket and key:
# Check if the storage provider supports deletion
if hasattr(self.storage, 'delete_file'):
await self.storage.delete_file(bucket, key)
logger.info(f"Deleted file for document {document_id} from storage (bucket: {bucket}, key: {key})")
else:
logger.warning(f"Storage provider does not support file deletion")
# Also handle the case of multiple file versions in storage_files
if hasattr(document, 'storage_files') and document.storage_files:
for file_info in document.storage_files:
bucket = file_info.get("bucket")
key = file_info.get("key")
if bucket and key and hasattr(self.storage, 'delete_file'):
await self.storage.delete_file(bucket, key)
logger.info(f"Deleted file version for document {document_id} from storage (bucket: {bucket}, key: {key})")
except Exception as e:
logger.error(f"Error deleting file for document {document_id}: {e}")
# We continue even if file deletion fails - don't block document deletion
logger.info(f"Successfully deleted document {document_id} and all associated data")
return True
def close(self):
"""Close all resources."""
# Close any active caches

View File

@ -1723,6 +1723,94 @@ async def test_batch_ingest_sequential_vs_parallel(
assert len(result["errors"]) == 0
@pytest.mark.asyncio
async def test_delete_document(client: AsyncClient):
"""Test deleting a document and verifying it's gone."""
# First ingest a document to delete
content = "This is a test document that will be deleted."
doc_id = await test_ingest_text_document(client, content=content)
headers = create_auth_header()
# Verify document exists
response = await client.get(f"/documents/{doc_id}", headers=headers)
assert response.status_code == 200
# Verify document is searchable
search_response = await client.post(
"/retrieve/chunks",
json={
"query": "test document deleted",
"filters": {"external_id": doc_id},
},
headers=headers,
)
assert search_response.status_code == 200
chunks = search_response.json()
assert len(chunks) > 0
# Delete document
delete_response = await client.delete(
f"/documents/{doc_id}",
headers=headers,
)
assert delete_response.status_code == 200
result = delete_response.json()
assert result["status"] == "success"
assert f"Document {doc_id} deleted successfully" in result["message"]
# Verify document no longer exists
response = await client.get(f"/documents/{doc_id}", headers=headers)
assert response.status_code == 404
# Verify document is no longer searchable
search_response = await client.post(
"/retrieve/chunks",
json={
"query": "test document deleted",
"filters": {"external_id": doc_id},
},
headers=headers,
)
assert search_response.status_code == 200
chunks = search_response.json()
assert len(chunks) == 0 # No chunks should be found
@pytest.mark.asyncio
async def test_delete_document_permission_error(client: AsyncClient):
"""Test permissions handling for document deletion."""
if get_settings().dev_mode:
pytest.skip("Auth tests skipped in dev mode")
# First ingest a document to delete
content = "This is a test document for testing delete permissions."
doc_id = await test_ingest_text_document(client, content=content)
# Try to delete with read-only permission
headers = create_auth_header(permissions=["read"])
delete_response = await client.delete(
f"/documents/{doc_id}",
headers=headers,
)
assert delete_response.status_code == 403
# Verify document still exists
headers = create_auth_header() # Full permissions
response = await client.get(f"/documents/{doc_id}", headers=headers)
assert response.status_code == 200
@pytest.mark.asyncio
async def test_delete_nonexistent_document(client: AsyncClient):
"""Test deleting a document that doesn't exist."""
headers = create_auth_header()
delete_response = await client.delete(
"/documents/nonexistent_document_id",
headers=headers,
)
assert delete_response.status_code == 404
@pytest.mark.asyncio
async def test_cross_document_query_with_graph(client: AsyncClient):
"""Test cross-document information retrieval using knowledge graph."""

View File

@ -34,3 +34,16 @@ class BaseVectorStore(ABC):
List of DocumentChunk objects
"""
pass
@abstractmethod
async def delete_chunks_by_document_id(self, document_id: str) -> bool:
"""
Delete all chunks associated with a document.
Args:
document_id: ID of the document whose chunks should be deleted
Returns:
bool: True if the operation was successful, False otherwise
"""
pass

View File

@ -328,6 +328,28 @@ class MultiVectorStore(BaseVectorStore):
logger.debug(f"Found {len(chunks)} chunks in batch retrieval from multi-vector store")
return chunks
async def delete_chunks_by_document_id(self, document_id: str) -> bool:
"""
Delete all chunks associated with a document.
Args:
document_id: ID of the document whose chunks should be deleted
Returns:
bool: True if the operation was successful, False otherwise
"""
try:
# Delete all chunks for the specified document
query = f"DELETE FROM multi_vector_embeddings WHERE document_id = '{document_id}'"
self.conn.execute(query)
logger.info(f"Deleted all chunks for document {document_id} from multi-vector store")
return True
except Exception as e:
logger.error(f"Error deleting chunks for document {document_id} from multi-vector store: {str(e)}")
return False
def close(self):
"""Close the database connection."""
if self.conn:

View File

@ -241,3 +241,27 @@ class PGVectorStore(BaseVectorStore):
except Exception as e:
logger.error(f"Error retrieving chunks by ID: {str(e)}")
return []
async def delete_chunks_by_document_id(self, document_id: str) -> bool:
"""
Delete all chunks associated with a document.
Args:
document_id: ID of the document whose chunks should be deleted
Returns:
bool: True if the operation was successful, False otherwise
"""
try:
async with self.async_session() as session:
# Delete all chunks for the specified document
query = text(f"DELETE FROM vector_embeddings WHERE document_id = :doc_id")
await session.execute(query, {"doc_id": document_id})
await session.commit()
logger.info(f"Deleted all chunks for document {document_id}")
return True
except Exception as e:
logger.error(f"Error deleting chunks for document {document_id}: {str(e)}")
return False

View File

@ -1228,6 +1228,57 @@ class AsyncDataBridge:
"""
response = await self._request("GET", "graphs")
return [Graph(**graph) for graph in response]
async def delete_document(self, document_id: str) -> Dict[str, str]:
"""
Delete a document and all its associated data.
This method deletes a document and all its associated data, including:
- Document metadata
- Document content in storage
- Document chunks and embeddings in vector store
Args:
document_id: ID of the document to delete
Returns:
Dict[str, str]: Deletion status
Example:
```python
# Delete a document
result = await db.delete_document("doc_123")
print(result["message"]) # Document doc_123 deleted successfully
```
"""
response = await self._request("DELETE", f"documents/{document_id}")
return response
async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
"""
Delete a document by its filename.
This is a convenience method that first retrieves the document ID by filename
and then deletes the document by ID.
Args:
filename: Filename of the document to delete
Returns:
Dict[str, str]: Deletion status
Example:
```python
# Delete a document by filename
result = await db.delete_document_by_filename("report.pdf")
print(result["message"])
```
"""
# First get the document by filename to obtain its ID
doc = await self.get_document_by_filename(filename)
# Then delete the document by ID
return await self.delete_document(doc.external_id)
async def close(self):
"""Close the HTTP client"""

View File

@ -1259,6 +1259,57 @@ class DataBridge:
"""
response = self._request("GET", "graphs")
return [Graph(**graph) for graph in response]
def delete_document(self, document_id: str) -> Dict[str, str]:
"""
Delete a document and all its associated data.
This method deletes a document and all its associated data, including:
- Document metadata
- Document content in storage
- Document chunks and embeddings in vector store
Args:
document_id: ID of the document to delete
Returns:
Dict[str, str]: Deletion status
Example:
```python
# Delete a document
result = db.delete_document("doc_123")
print(result["message"]) # Document doc_123 deleted successfully
```
"""
response = self._request("DELETE", f"documents/{document_id}")
return response
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
"""
Delete a document by its filename.
This is a convenience method that first retrieves the document ID by filename
and then deletes the document by ID.
Args:
filename: Filename of the document to delete
Returns:
Dict[str, str]: Deletion status
Example:
```python
# Delete a document by filename
result = db.delete_document_by_filename("report.pdf")
print(result["message"])
```
"""
# First get the document by filename to obtain its ID
doc = self.get_document_by_filename(filename)
# Then delete the document by ID
return self.delete_document(doc.external_id)
def close(self):
"""Close the HTTP session"""

View File

@ -168,6 +168,36 @@ const DataBridgeUI = () => {
const handleDocumentClick = (document: Document) => {
fetchDocument(document.external_id);
};
// Handle document deletion
const handleDeleteDocument = async (documentId: string) => {
try {
setLoading(true);
setError(null);
const response = await fetch(`${API_BASE_URL}/documents/${documentId}`, {
method: 'DELETE',
headers
});
if (!response.ok) {
throw new Error(`Failed to delete document: ${response.statusText}`);
}
// Clear selected document if it was the one deleted
if (selectedDocument?.external_id === documentId) {
setSelectedDocument(null);
}
// Refresh documents list
await fetchDocuments();
} catch (err) {
setError(err instanceof Error ? err.message : 'An unknown error occurred');
} finally {
setLoading(false);
}
};
// Handle file upload
const handleFileUpload = async () => {
@ -762,6 +792,39 @@ const DataBridgeUI = () => {
</AccordionContent>
</AccordionItem>
</Accordion>
<div className="pt-4 border-t mt-4">
<Dialog>
<DialogTrigger asChild>
<Button variant="outline" size="sm" className="w-full border-red-500 text-red-500 hover:bg-red-50">
Delete Document
</Button>
</DialogTrigger>
<DialogContent>
<DialogHeader>
<DialogTitle>Delete Document</DialogTitle>
<DialogDescription>
Are you sure you want to delete this document? This action cannot be undone.
</DialogDescription>
</DialogHeader>
<div className="py-3">
<p className="font-medium">Document: {selectedDocument.filename || selectedDocument.external_id}</p>
<p className="text-sm text-gray-500 mt-1">ID: {selectedDocument.external_id}</p>
</div>
<DialogFooter>
<Button variant="outline" onClick={() => (document.querySelector('[data-state="open"] button[data-state="closed"]') as HTMLElement)?.click()}>Cancel</Button>
<Button
variant="outline"
className="border-red-500 text-red-500 hover:bg-red-50"
onClick={() => handleDeleteDocument(selectedDocument.external_id)}
disabled={loading}
>
{loading ? 'Deleting...' : 'Delete'}
</Button>
</DialogFooter>
</DialogContent>
</Dialog>
</div>
</div>
</ScrollArea>
</CardContent>