mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
Add delete document endpoint (#62)
This commit is contained in:
parent
6a7f2586a0
commit
9ce0507616
31
core/api.py
31
core/api.py
@ -640,6 +640,37 @@ async def get_document(document_id: str, auth: AuthContext = Depends(verify_toke
|
||||
raise e
|
||||
|
||||
|
||||
@app.delete("/documents/{document_id}")
|
||||
async def delete_document(document_id: str, auth: AuthContext = Depends(verify_token)):
|
||||
"""
|
||||
Delete a document and all associated data.
|
||||
|
||||
This endpoint deletes a document and all its associated data, including:
|
||||
- Document metadata
|
||||
- Document content in storage
|
||||
- Document chunks and embeddings in vector store
|
||||
|
||||
Args:
|
||||
document_id: ID of the document to delete
|
||||
auth: Authentication context (must have write access to the document)
|
||||
|
||||
Returns:
|
||||
Deletion status
|
||||
"""
|
||||
try:
|
||||
async with telemetry.track_operation(
|
||||
operation_type="delete_document",
|
||||
user_id=auth.entity_id,
|
||||
metadata={"document_id": document_id},
|
||||
):
|
||||
success = await document_service.delete_document(document_id, auth)
|
||||
if not success:
|
||||
raise HTTPException(status_code=404, detail="Document not found or delete failed")
|
||||
return {"status": "success", "message": f"Document {document_id} deleted successfully"}
|
||||
except PermissionError as e:
|
||||
raise HTTPException(status_code=403, detail=str(e))
|
||||
|
||||
|
||||
@app.get("/documents/filename/{filename}", response_model=Document)
|
||||
async def get_document_by_filename(filename: str, auth: AuthContext = Depends(verify_token)):
|
||||
"""Get document by filename."""
|
||||
|
@ -399,9 +399,9 @@ class PostgresDatabase(BaseDatabase):
|
||||
return False
|
||||
|
||||
async def delete_document(self, document_id: str, auth: AuthContext) -> bool:
|
||||
"""Delete document if user has admin access."""
|
||||
"""Delete document if user has write access."""
|
||||
try:
|
||||
if not await self.check_access(document_id, auth, "admin"):
|
||||
if not await self.check_access(document_id, auth, "write"):
|
||||
return False
|
||||
|
||||
async with self.async_session() as session:
|
||||
|
@ -1307,6 +1307,95 @@ class DocumentService:
|
||||
documents=documents,
|
||||
)
|
||||
|
||||
async def delete_document(self, document_id: str, auth: AuthContext) -> bool:
|
||||
"""
|
||||
Delete a document and all its associated data.
|
||||
|
||||
This method:
|
||||
1. Checks if the user has write access to the document
|
||||
2. Gets the document to retrieve its chunk IDs
|
||||
3. Deletes the document from the database
|
||||
4. Deletes all associated chunks from the vector store (if possible)
|
||||
5. Deletes the original file from storage if present
|
||||
|
||||
Args:
|
||||
document_id: ID of the document to delete
|
||||
auth: Authentication context
|
||||
|
||||
Returns:
|
||||
bool: True if deletion was successful, False otherwise
|
||||
|
||||
Raises:
|
||||
PermissionError: If the user doesn't have write access
|
||||
"""
|
||||
# First get the document to retrieve its chunk IDs
|
||||
document = await self.db.get_document(document_id, auth)
|
||||
|
||||
if not document:
|
||||
logger.error(f"Document {document_id} not found")
|
||||
return False
|
||||
|
||||
# Verify write access - the database layer also checks this, but we check here too
|
||||
# to avoid unnecessary operations if the user doesn't have permission
|
||||
if not await self.db.check_access(document_id, auth, "write"):
|
||||
logger.error(f"User {auth.entity_id} doesn't have write access to document {document_id}")
|
||||
raise PermissionError(f"User doesn't have write access to document {document_id}")
|
||||
|
||||
# Delete document from database
|
||||
db_success = await self.db.delete_document(document_id, auth)
|
||||
if not db_success:
|
||||
logger.error(f"Failed to delete document {document_id} from database")
|
||||
return False
|
||||
|
||||
logger.info(f"Deleted document {document_id} from database")
|
||||
|
||||
# Try to delete chunks from vector store if they exist
|
||||
if hasattr(document, 'chunk_ids') and document.chunk_ids:
|
||||
try:
|
||||
# Try to delete chunks by document ID
|
||||
# Note: Some vector stores may not implement this method
|
||||
if hasattr(self.vector_store, 'delete_chunks_by_document_id'):
|
||||
await self.vector_store.delete_chunks_by_document_id(document_id)
|
||||
logger.info(f"Deleted chunks for document {document_id} from vector store")
|
||||
else:
|
||||
logger.warning(f"Vector store does not support deleting chunks by document ID")
|
||||
|
||||
# Try to delete from colpali vector store as well
|
||||
if self.colpali_vector_store and hasattr(self.colpali_vector_store, 'delete_chunks_by_document_id'):
|
||||
await self.colpali_vector_store.delete_chunks_by_document_id(document_id)
|
||||
logger.info(f"Deleted chunks for document {document_id} from colpali vector store")
|
||||
except Exception as e:
|
||||
logger.error(f"Error deleting chunks for document {document_id}: {e}")
|
||||
# We continue even if chunk deletion fails - don't block document deletion
|
||||
|
||||
# Delete file from storage if it exists
|
||||
if hasattr(document, 'storage_info') and document.storage_info:
|
||||
try:
|
||||
bucket = document.storage_info.get("bucket")
|
||||
key = document.storage_info.get("key")
|
||||
if bucket and key:
|
||||
# Check if the storage provider supports deletion
|
||||
if hasattr(self.storage, 'delete_file'):
|
||||
await self.storage.delete_file(bucket, key)
|
||||
logger.info(f"Deleted file for document {document_id} from storage (bucket: {bucket}, key: {key})")
|
||||
else:
|
||||
logger.warning(f"Storage provider does not support file deletion")
|
||||
|
||||
# Also handle the case of multiple file versions in storage_files
|
||||
if hasattr(document, 'storage_files') and document.storage_files:
|
||||
for file_info in document.storage_files:
|
||||
bucket = file_info.get("bucket")
|
||||
key = file_info.get("key")
|
||||
if bucket and key and hasattr(self.storage, 'delete_file'):
|
||||
await self.storage.delete_file(bucket, key)
|
||||
logger.info(f"Deleted file version for document {document_id} from storage (bucket: {bucket}, key: {key})")
|
||||
except Exception as e:
|
||||
logger.error(f"Error deleting file for document {document_id}: {e}")
|
||||
# We continue even if file deletion fails - don't block document deletion
|
||||
|
||||
logger.info(f"Successfully deleted document {document_id} and all associated data")
|
||||
return True
|
||||
|
||||
def close(self):
|
||||
"""Close all resources."""
|
||||
# Close any active caches
|
||||
|
@ -1723,6 +1723,94 @@ async def test_batch_ingest_sequential_vs_parallel(
|
||||
assert len(result["errors"]) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_delete_document(client: AsyncClient):
|
||||
"""Test deleting a document and verifying it's gone."""
|
||||
# First ingest a document to delete
|
||||
content = "This is a test document that will be deleted."
|
||||
doc_id = await test_ingest_text_document(client, content=content)
|
||||
|
||||
headers = create_auth_header()
|
||||
|
||||
# Verify document exists
|
||||
response = await client.get(f"/documents/{doc_id}", headers=headers)
|
||||
assert response.status_code == 200
|
||||
|
||||
# Verify document is searchable
|
||||
search_response = await client.post(
|
||||
"/retrieve/chunks",
|
||||
json={
|
||||
"query": "test document deleted",
|
||||
"filters": {"external_id": doc_id},
|
||||
},
|
||||
headers=headers,
|
||||
)
|
||||
assert search_response.status_code == 200
|
||||
chunks = search_response.json()
|
||||
assert len(chunks) > 0
|
||||
|
||||
# Delete document
|
||||
delete_response = await client.delete(
|
||||
f"/documents/{doc_id}",
|
||||
headers=headers,
|
||||
)
|
||||
assert delete_response.status_code == 200
|
||||
result = delete_response.json()
|
||||
assert result["status"] == "success"
|
||||
assert f"Document {doc_id} deleted successfully" in result["message"]
|
||||
|
||||
# Verify document no longer exists
|
||||
response = await client.get(f"/documents/{doc_id}", headers=headers)
|
||||
assert response.status_code == 404
|
||||
|
||||
# Verify document is no longer searchable
|
||||
search_response = await client.post(
|
||||
"/retrieve/chunks",
|
||||
json={
|
||||
"query": "test document deleted",
|
||||
"filters": {"external_id": doc_id},
|
||||
},
|
||||
headers=headers,
|
||||
)
|
||||
assert search_response.status_code == 200
|
||||
chunks = search_response.json()
|
||||
assert len(chunks) == 0 # No chunks should be found
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_delete_document_permission_error(client: AsyncClient):
|
||||
"""Test permissions handling for document deletion."""
|
||||
if get_settings().dev_mode:
|
||||
pytest.skip("Auth tests skipped in dev mode")
|
||||
|
||||
# First ingest a document to delete
|
||||
content = "This is a test document for testing delete permissions."
|
||||
doc_id = await test_ingest_text_document(client, content=content)
|
||||
|
||||
# Try to delete with read-only permission
|
||||
headers = create_auth_header(permissions=["read"])
|
||||
|
||||
delete_response = await client.delete(
|
||||
f"/documents/{doc_id}",
|
||||
headers=headers,
|
||||
)
|
||||
assert delete_response.status_code == 403
|
||||
|
||||
# Verify document still exists
|
||||
headers = create_auth_header() # Full permissions
|
||||
response = await client.get(f"/documents/{doc_id}", headers=headers)
|
||||
assert response.status_code == 200
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_delete_nonexistent_document(client: AsyncClient):
|
||||
"""Test deleting a document that doesn't exist."""
|
||||
headers = create_auth_header()
|
||||
|
||||
delete_response = await client.delete(
|
||||
"/documents/nonexistent_document_id",
|
||||
headers=headers,
|
||||
)
|
||||
assert delete_response.status_code == 404
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cross_document_query_with_graph(client: AsyncClient):
|
||||
"""Test cross-document information retrieval using knowledge graph."""
|
||||
|
@ -34,3 +34,16 @@ class BaseVectorStore(ABC):
|
||||
List of DocumentChunk objects
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def delete_chunks_by_document_id(self, document_id: str) -> bool:
|
||||
"""
|
||||
Delete all chunks associated with a document.
|
||||
|
||||
Args:
|
||||
document_id: ID of the document whose chunks should be deleted
|
||||
|
||||
Returns:
|
||||
bool: True if the operation was successful, False otherwise
|
||||
"""
|
||||
pass
|
||||
|
@ -328,6 +328,28 @@ class MultiVectorStore(BaseVectorStore):
|
||||
logger.debug(f"Found {len(chunks)} chunks in batch retrieval from multi-vector store")
|
||||
return chunks
|
||||
|
||||
async def delete_chunks_by_document_id(self, document_id: str) -> bool:
|
||||
"""
|
||||
Delete all chunks associated with a document.
|
||||
|
||||
Args:
|
||||
document_id: ID of the document whose chunks should be deleted
|
||||
|
||||
Returns:
|
||||
bool: True if the operation was successful, False otherwise
|
||||
"""
|
||||
try:
|
||||
# Delete all chunks for the specified document
|
||||
query = f"DELETE FROM multi_vector_embeddings WHERE document_id = '{document_id}'"
|
||||
self.conn.execute(query)
|
||||
|
||||
logger.info(f"Deleted all chunks for document {document_id} from multi-vector store")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error deleting chunks for document {document_id} from multi-vector store: {str(e)}")
|
||||
return False
|
||||
|
||||
def close(self):
|
||||
"""Close the database connection."""
|
||||
if self.conn:
|
||||
|
@ -241,3 +241,27 @@ class PGVectorStore(BaseVectorStore):
|
||||
except Exception as e:
|
||||
logger.error(f"Error retrieving chunks by ID: {str(e)}")
|
||||
return []
|
||||
|
||||
async def delete_chunks_by_document_id(self, document_id: str) -> bool:
|
||||
"""
|
||||
Delete all chunks associated with a document.
|
||||
|
||||
Args:
|
||||
document_id: ID of the document whose chunks should be deleted
|
||||
|
||||
Returns:
|
||||
bool: True if the operation was successful, False otherwise
|
||||
"""
|
||||
try:
|
||||
async with self.async_session() as session:
|
||||
# Delete all chunks for the specified document
|
||||
query = text(f"DELETE FROM vector_embeddings WHERE document_id = :doc_id")
|
||||
await session.execute(query, {"doc_id": document_id})
|
||||
await session.commit()
|
||||
|
||||
logger.info(f"Deleted all chunks for document {document_id}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error deleting chunks for document {document_id}: {str(e)}")
|
||||
return False
|
||||
|
@ -1228,6 +1228,57 @@ class AsyncDataBridge:
|
||||
"""
|
||||
response = await self._request("GET", "graphs")
|
||||
return [Graph(**graph) for graph in response]
|
||||
|
||||
async def delete_document(self, document_id: str) -> Dict[str, str]:
|
||||
"""
|
||||
Delete a document and all its associated data.
|
||||
|
||||
This method deletes a document and all its associated data, including:
|
||||
- Document metadata
|
||||
- Document content in storage
|
||||
- Document chunks and embeddings in vector store
|
||||
|
||||
Args:
|
||||
document_id: ID of the document to delete
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: Deletion status
|
||||
|
||||
Example:
|
||||
```python
|
||||
# Delete a document
|
||||
result = await db.delete_document("doc_123")
|
||||
print(result["message"]) # Document doc_123 deleted successfully
|
||||
```
|
||||
"""
|
||||
response = await self._request("DELETE", f"documents/{document_id}")
|
||||
return response
|
||||
|
||||
async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
||||
"""
|
||||
Delete a document by its filename.
|
||||
|
||||
This is a convenience method that first retrieves the document ID by filename
|
||||
and then deletes the document by ID.
|
||||
|
||||
Args:
|
||||
filename: Filename of the document to delete
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: Deletion status
|
||||
|
||||
Example:
|
||||
```python
|
||||
# Delete a document by filename
|
||||
result = await db.delete_document_by_filename("report.pdf")
|
||||
print(result["message"])
|
||||
```
|
||||
"""
|
||||
# First get the document by filename to obtain its ID
|
||||
doc = await self.get_document_by_filename(filename)
|
||||
|
||||
# Then delete the document by ID
|
||||
return await self.delete_document(doc.external_id)
|
||||
|
||||
async def close(self):
|
||||
"""Close the HTTP client"""
|
||||
|
@ -1259,6 +1259,57 @@ class DataBridge:
|
||||
"""
|
||||
response = self._request("GET", "graphs")
|
||||
return [Graph(**graph) for graph in response]
|
||||
|
||||
def delete_document(self, document_id: str) -> Dict[str, str]:
|
||||
"""
|
||||
Delete a document and all its associated data.
|
||||
|
||||
This method deletes a document and all its associated data, including:
|
||||
- Document metadata
|
||||
- Document content in storage
|
||||
- Document chunks and embeddings in vector store
|
||||
|
||||
Args:
|
||||
document_id: ID of the document to delete
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: Deletion status
|
||||
|
||||
Example:
|
||||
```python
|
||||
# Delete a document
|
||||
result = db.delete_document("doc_123")
|
||||
print(result["message"]) # Document doc_123 deleted successfully
|
||||
```
|
||||
"""
|
||||
response = self._request("DELETE", f"documents/{document_id}")
|
||||
return response
|
||||
|
||||
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
||||
"""
|
||||
Delete a document by its filename.
|
||||
|
||||
This is a convenience method that first retrieves the document ID by filename
|
||||
and then deletes the document by ID.
|
||||
|
||||
Args:
|
||||
filename: Filename of the document to delete
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: Deletion status
|
||||
|
||||
Example:
|
||||
```python
|
||||
# Delete a document by filename
|
||||
result = db.delete_document_by_filename("report.pdf")
|
||||
print(result["message"])
|
||||
```
|
||||
"""
|
||||
# First get the document by filename to obtain its ID
|
||||
doc = self.get_document_by_filename(filename)
|
||||
|
||||
# Then delete the document by ID
|
||||
return self.delete_document(doc.external_id)
|
||||
|
||||
def close(self):
|
||||
"""Close the HTTP session"""
|
||||
|
@ -168,6 +168,36 @@ const DataBridgeUI = () => {
|
||||
const handleDocumentClick = (document: Document) => {
|
||||
fetchDocument(document.external_id);
|
||||
};
|
||||
|
||||
// Handle document deletion
|
||||
const handleDeleteDocument = async (documentId: string) => {
|
||||
try {
|
||||
setLoading(true);
|
||||
setError(null);
|
||||
|
||||
const response = await fetch(`${API_BASE_URL}/documents/${documentId}`, {
|
||||
method: 'DELETE',
|
||||
headers
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Failed to delete document: ${response.statusText}`);
|
||||
}
|
||||
|
||||
// Clear selected document if it was the one deleted
|
||||
if (selectedDocument?.external_id === documentId) {
|
||||
setSelectedDocument(null);
|
||||
}
|
||||
|
||||
// Refresh documents list
|
||||
await fetchDocuments();
|
||||
|
||||
} catch (err) {
|
||||
setError(err instanceof Error ? err.message : 'An unknown error occurred');
|
||||
} finally {
|
||||
setLoading(false);
|
||||
}
|
||||
};
|
||||
|
||||
// Handle file upload
|
||||
const handleFileUpload = async () => {
|
||||
@ -762,6 +792,39 @@ const DataBridgeUI = () => {
|
||||
</AccordionContent>
|
||||
</AccordionItem>
|
||||
</Accordion>
|
||||
|
||||
<div className="pt-4 border-t mt-4">
|
||||
<Dialog>
|
||||
<DialogTrigger asChild>
|
||||
<Button variant="outline" size="sm" className="w-full border-red-500 text-red-500 hover:bg-red-50">
|
||||
Delete Document
|
||||
</Button>
|
||||
</DialogTrigger>
|
||||
<DialogContent>
|
||||
<DialogHeader>
|
||||
<DialogTitle>Delete Document</DialogTitle>
|
||||
<DialogDescription>
|
||||
Are you sure you want to delete this document? This action cannot be undone.
|
||||
</DialogDescription>
|
||||
</DialogHeader>
|
||||
<div className="py-3">
|
||||
<p className="font-medium">Document: {selectedDocument.filename || selectedDocument.external_id}</p>
|
||||
<p className="text-sm text-gray-500 mt-1">ID: {selectedDocument.external_id}</p>
|
||||
</div>
|
||||
<DialogFooter>
|
||||
<Button variant="outline" onClick={() => (document.querySelector('[data-state="open"] button[data-state="closed"]') as HTMLElement)?.click()}>Cancel</Button>
|
||||
<Button
|
||||
variant="outline"
|
||||
className="border-red-500 text-red-500 hover:bg-red-50"
|
||||
onClick={() => handleDeleteDocument(selectedDocument.external_id)}
|
||||
disabled={loading}
|
||||
>
|
||||
{loading ? 'Deleting...' : 'Delete'}
|
||||
</Button>
|
||||
</DialogFooter>
|
||||
</DialogContent>
|
||||
</Dialog>
|
||||
</div>
|
||||
</div>
|
||||
</ScrollArea>
|
||||
</CardContent>
|
||||
|
Loading…
x
Reference in New Issue
Block a user