diff --git a/core/api.py b/core/api.py index 0bf73f9..e2cf515 100644 --- a/core/api.py +++ b/core/api.py @@ -640,6 +640,37 @@ async def get_document(document_id: str, auth: AuthContext = Depends(verify_toke raise e +@app.delete("/documents/{document_id}") +async def delete_document(document_id: str, auth: AuthContext = Depends(verify_token)): + """ + Delete a document and all associated data. + + This endpoint deletes a document and all its associated data, including: + - Document metadata + - Document content in storage + - Document chunks and embeddings in vector store + + Args: + document_id: ID of the document to delete + auth: Authentication context (must have write access to the document) + + Returns: + Deletion status + """ + try: + async with telemetry.track_operation( + operation_type="delete_document", + user_id=auth.entity_id, + metadata={"document_id": document_id}, + ): + success = await document_service.delete_document(document_id, auth) + if not success: + raise HTTPException(status_code=404, detail="Document not found or delete failed") + return {"status": "success", "message": f"Document {document_id} deleted successfully"} + except PermissionError as e: + raise HTTPException(status_code=403, detail=str(e)) + + @app.get("/documents/filename/{filename}", response_model=Document) async def get_document_by_filename(filename: str, auth: AuthContext = Depends(verify_token)): """Get document by filename.""" diff --git a/core/database/postgres_database.py b/core/database/postgres_database.py index 2e0a780..a1461ba 100644 --- a/core/database/postgres_database.py +++ b/core/database/postgres_database.py @@ -399,9 +399,9 @@ class PostgresDatabase(BaseDatabase): return False async def delete_document(self, document_id: str, auth: AuthContext) -> bool: - """Delete document if user has admin access.""" + """Delete document if user has write access.""" try: - if not await self.check_access(document_id, auth, "admin"): + if not await self.check_access(document_id, auth, "write"): return False async with self.async_session() as session: diff --git a/core/services/document_service.py b/core/services/document_service.py index a339c85..9338803 100644 --- a/core/services/document_service.py +++ b/core/services/document_service.py @@ -1307,6 +1307,95 @@ class DocumentService: documents=documents, ) + async def delete_document(self, document_id: str, auth: AuthContext) -> bool: + """ + Delete a document and all its associated data. + + This method: + 1. Checks if the user has write access to the document + 2. Gets the document to retrieve its chunk IDs + 3. Deletes the document from the database + 4. Deletes all associated chunks from the vector store (if possible) + 5. Deletes the original file from storage if present + + Args: + document_id: ID of the document to delete + auth: Authentication context + + Returns: + bool: True if deletion was successful, False otherwise + + Raises: + PermissionError: If the user doesn't have write access + """ + # First get the document to retrieve its chunk IDs + document = await self.db.get_document(document_id, auth) + + if not document: + logger.error(f"Document {document_id} not found") + return False + + # Verify write access - the database layer also checks this, but we check here too + # to avoid unnecessary operations if the user doesn't have permission + if not await self.db.check_access(document_id, auth, "write"): + logger.error(f"User {auth.entity_id} doesn't have write access to document {document_id}") + raise PermissionError(f"User doesn't have write access to document {document_id}") + + # Delete document from database + db_success = await self.db.delete_document(document_id, auth) + if not db_success: + logger.error(f"Failed to delete document {document_id} from database") + return False + + logger.info(f"Deleted document {document_id} from database") + + # Try to delete chunks from vector store if they exist + if hasattr(document, 'chunk_ids') and document.chunk_ids: + try: + # Try to delete chunks by document ID + # Note: Some vector stores may not implement this method + if hasattr(self.vector_store, 'delete_chunks_by_document_id'): + await self.vector_store.delete_chunks_by_document_id(document_id) + logger.info(f"Deleted chunks for document {document_id} from vector store") + else: + logger.warning(f"Vector store does not support deleting chunks by document ID") + + # Try to delete from colpali vector store as well + if self.colpali_vector_store and hasattr(self.colpali_vector_store, 'delete_chunks_by_document_id'): + await self.colpali_vector_store.delete_chunks_by_document_id(document_id) + logger.info(f"Deleted chunks for document {document_id} from colpali vector store") + except Exception as e: + logger.error(f"Error deleting chunks for document {document_id}: {e}") + # We continue even if chunk deletion fails - don't block document deletion + + # Delete file from storage if it exists + if hasattr(document, 'storage_info') and document.storage_info: + try: + bucket = document.storage_info.get("bucket") + key = document.storage_info.get("key") + if bucket and key: + # Check if the storage provider supports deletion + if hasattr(self.storage, 'delete_file'): + await self.storage.delete_file(bucket, key) + logger.info(f"Deleted file for document {document_id} from storage (bucket: {bucket}, key: {key})") + else: + logger.warning(f"Storage provider does not support file deletion") + + # Also handle the case of multiple file versions in storage_files + if hasattr(document, 'storage_files') and document.storage_files: + for file_info in document.storage_files: + bucket = file_info.get("bucket") + key = file_info.get("key") + if bucket and key and hasattr(self.storage, 'delete_file'): + await self.storage.delete_file(bucket, key) + logger.info(f"Deleted file version for document {document_id} from storage (bucket: {bucket}, key: {key})") + except Exception as e: + logger.error(f"Error deleting file for document {document_id}: {e}") + # We continue even if file deletion fails - don't block document deletion + + logger.info(f"Successfully deleted document {document_id} and all associated data") + return True + def close(self): """Close all resources.""" # Close any active caches diff --git a/core/tests/integration/test_api.py b/core/tests/integration/test_api.py index b42ebd7..2358d65 100644 --- a/core/tests/integration/test_api.py +++ b/core/tests/integration/test_api.py @@ -1723,6 +1723,94 @@ async def test_batch_ingest_sequential_vs_parallel( assert len(result["errors"]) == 0 +@pytest.mark.asyncio +async def test_delete_document(client: AsyncClient): + """Test deleting a document and verifying it's gone.""" + # First ingest a document to delete + content = "This is a test document that will be deleted." + doc_id = await test_ingest_text_document(client, content=content) + + headers = create_auth_header() + + # Verify document exists + response = await client.get(f"/documents/{doc_id}", headers=headers) + assert response.status_code == 200 + + # Verify document is searchable + search_response = await client.post( + "/retrieve/chunks", + json={ + "query": "test document deleted", + "filters": {"external_id": doc_id}, + }, + headers=headers, + ) + assert search_response.status_code == 200 + chunks = search_response.json() + assert len(chunks) > 0 + + # Delete document + delete_response = await client.delete( + f"/documents/{doc_id}", + headers=headers, + ) + assert delete_response.status_code == 200 + result = delete_response.json() + assert result["status"] == "success" + assert f"Document {doc_id} deleted successfully" in result["message"] + + # Verify document no longer exists + response = await client.get(f"/documents/{doc_id}", headers=headers) + assert response.status_code == 404 + + # Verify document is no longer searchable + search_response = await client.post( + "/retrieve/chunks", + json={ + "query": "test document deleted", + "filters": {"external_id": doc_id}, + }, + headers=headers, + ) + assert search_response.status_code == 200 + chunks = search_response.json() + assert len(chunks) == 0 # No chunks should be found + +@pytest.mark.asyncio +async def test_delete_document_permission_error(client: AsyncClient): + """Test permissions handling for document deletion.""" + if get_settings().dev_mode: + pytest.skip("Auth tests skipped in dev mode") + + # First ingest a document to delete + content = "This is a test document for testing delete permissions." + doc_id = await test_ingest_text_document(client, content=content) + + # Try to delete with read-only permission + headers = create_auth_header(permissions=["read"]) + + delete_response = await client.delete( + f"/documents/{doc_id}", + headers=headers, + ) + assert delete_response.status_code == 403 + + # Verify document still exists + headers = create_auth_header() # Full permissions + response = await client.get(f"/documents/{doc_id}", headers=headers) + assert response.status_code == 200 + +@pytest.mark.asyncio +async def test_delete_nonexistent_document(client: AsyncClient): + """Test deleting a document that doesn't exist.""" + headers = create_auth_header() + + delete_response = await client.delete( + "/documents/nonexistent_document_id", + headers=headers, + ) + assert delete_response.status_code == 404 + @pytest.mark.asyncio async def test_cross_document_query_with_graph(client: AsyncClient): """Test cross-document information retrieval using knowledge graph.""" diff --git a/core/vector_store/base_vector_store.py b/core/vector_store/base_vector_store.py index 3566185..71642e7 100644 --- a/core/vector_store/base_vector_store.py +++ b/core/vector_store/base_vector_store.py @@ -34,3 +34,16 @@ class BaseVectorStore(ABC): List of DocumentChunk objects """ pass + + @abstractmethod + async def delete_chunks_by_document_id(self, document_id: str) -> bool: + """ + Delete all chunks associated with a document. + + Args: + document_id: ID of the document whose chunks should be deleted + + Returns: + bool: True if the operation was successful, False otherwise + """ + pass diff --git a/core/vector_store/multi_vector_store.py b/core/vector_store/multi_vector_store.py index 9626f0c..d10ff7a 100644 --- a/core/vector_store/multi_vector_store.py +++ b/core/vector_store/multi_vector_store.py @@ -328,6 +328,28 @@ class MultiVectorStore(BaseVectorStore): logger.debug(f"Found {len(chunks)} chunks in batch retrieval from multi-vector store") return chunks + async def delete_chunks_by_document_id(self, document_id: str) -> bool: + """ + Delete all chunks associated with a document. + + Args: + document_id: ID of the document whose chunks should be deleted + + Returns: + bool: True if the operation was successful, False otherwise + """ + try: + # Delete all chunks for the specified document + query = f"DELETE FROM multi_vector_embeddings WHERE document_id = '{document_id}'" + self.conn.execute(query) + + logger.info(f"Deleted all chunks for document {document_id} from multi-vector store") + return True + + except Exception as e: + logger.error(f"Error deleting chunks for document {document_id} from multi-vector store: {str(e)}") + return False + def close(self): """Close the database connection.""" if self.conn: diff --git a/core/vector_store/pgvector_store.py b/core/vector_store/pgvector_store.py index 8e15410..d3890ff 100644 --- a/core/vector_store/pgvector_store.py +++ b/core/vector_store/pgvector_store.py @@ -241,3 +241,27 @@ class PGVectorStore(BaseVectorStore): except Exception as e: logger.error(f"Error retrieving chunks by ID: {str(e)}") return [] + + async def delete_chunks_by_document_id(self, document_id: str) -> bool: + """ + Delete all chunks associated with a document. + + Args: + document_id: ID of the document whose chunks should be deleted + + Returns: + bool: True if the operation was successful, False otherwise + """ + try: + async with self.async_session() as session: + # Delete all chunks for the specified document + query = text(f"DELETE FROM vector_embeddings WHERE document_id = :doc_id") + await session.execute(query, {"doc_id": document_id}) + await session.commit() + + logger.info(f"Deleted all chunks for document {document_id}") + return True + + except Exception as e: + logger.error(f"Error deleting chunks for document {document_id}: {str(e)}") + return False diff --git a/sdks/python/databridge/async_.py b/sdks/python/databridge/async_.py index 7f879c4..97e0966 100644 --- a/sdks/python/databridge/async_.py +++ b/sdks/python/databridge/async_.py @@ -1228,6 +1228,57 @@ class AsyncDataBridge: """ response = await self._request("GET", "graphs") return [Graph(**graph) for graph in response] + + async def delete_document(self, document_id: str) -> Dict[str, str]: + """ + Delete a document and all its associated data. + + This method deletes a document and all its associated data, including: + - Document metadata + - Document content in storage + - Document chunks and embeddings in vector store + + Args: + document_id: ID of the document to delete + + Returns: + Dict[str, str]: Deletion status + + Example: + ```python + # Delete a document + result = await db.delete_document("doc_123") + print(result["message"]) # Document doc_123 deleted successfully + ``` + """ + response = await self._request("DELETE", f"documents/{document_id}") + return response + + async def delete_document_by_filename(self, filename: str) -> Dict[str, str]: + """ + Delete a document by its filename. + + This is a convenience method that first retrieves the document ID by filename + and then deletes the document by ID. + + Args: + filename: Filename of the document to delete + + Returns: + Dict[str, str]: Deletion status + + Example: + ```python + # Delete a document by filename + result = await db.delete_document_by_filename("report.pdf") + print(result["message"]) + ``` + """ + # First get the document by filename to obtain its ID + doc = await self.get_document_by_filename(filename) + + # Then delete the document by ID + return await self.delete_document(doc.external_id) async def close(self): """Close the HTTP client""" diff --git a/sdks/python/databridge/sync.py b/sdks/python/databridge/sync.py index 2f757b3..5a9a60e 100644 --- a/sdks/python/databridge/sync.py +++ b/sdks/python/databridge/sync.py @@ -1259,6 +1259,57 @@ class DataBridge: """ response = self._request("GET", "graphs") return [Graph(**graph) for graph in response] + + def delete_document(self, document_id: str) -> Dict[str, str]: + """ + Delete a document and all its associated data. + + This method deletes a document and all its associated data, including: + - Document metadata + - Document content in storage + - Document chunks and embeddings in vector store + + Args: + document_id: ID of the document to delete + + Returns: + Dict[str, str]: Deletion status + + Example: + ```python + # Delete a document + result = db.delete_document("doc_123") + print(result["message"]) # Document doc_123 deleted successfully + ``` + """ + response = self._request("DELETE", f"documents/{document_id}") + return response + + def delete_document_by_filename(self, filename: str) -> Dict[str, str]: + """ + Delete a document by its filename. + + This is a convenience method that first retrieves the document ID by filename + and then deletes the document by ID. + + Args: + filename: Filename of the document to delete + + Returns: + Dict[str, str]: Deletion status + + Example: + ```python + # Delete a document by filename + result = db.delete_document_by_filename("report.pdf") + print(result["message"]) + ``` + """ + # First get the document by filename to obtain its ID + doc = self.get_document_by_filename(filename) + + # Then delete the document by ID + return self.delete_document(doc.external_id) def close(self): """Close the HTTP session""" diff --git a/ui-component/app/page.tsx b/ui-component/app/page.tsx index 101b2fa..f971af1 100644 --- a/ui-component/app/page.tsx +++ b/ui-component/app/page.tsx @@ -168,6 +168,36 @@ const DataBridgeUI = () => { const handleDocumentClick = (document: Document) => { fetchDocument(document.external_id); }; + + // Handle document deletion + const handleDeleteDocument = async (documentId: string) => { + try { + setLoading(true); + setError(null); + + const response = await fetch(`${API_BASE_URL}/documents/${documentId}`, { + method: 'DELETE', + headers + }); + + if (!response.ok) { + throw new Error(`Failed to delete document: ${response.statusText}`); + } + + // Clear selected document if it was the one deleted + if (selectedDocument?.external_id === documentId) { + setSelectedDocument(null); + } + + // Refresh documents list + await fetchDocuments(); + + } catch (err) { + setError(err instanceof Error ? err.message : 'An unknown error occurred'); + } finally { + setLoading(false); + } + }; // Handle file upload const handleFileUpload = async () => { @@ -762,6 +792,39 @@ const DataBridgeUI = () => { + +