Minor change to use SearchIndexingBufferedSender to support optimized batch indexing (#712)

Farzad Sunavala 2023-09-26 18:43:05 -05:00 committed by GitHub
parent c777f1025a
commit 552262ea89


@@ -55,6 +55,7 @@
 "from azure.search.documents import SearchClient \n",
 "from azure.search.documents.indexes import SearchIndexClient \n",
 "from azure.search.documents.models import Vector \n",
+"from azure.search.documents import SearchIndexingBufferedSender\n",
 "from azure.search.documents.indexes.models import ( \n",
 "    SearchIndex, \n",
 "    SearchField, \n",
@@ -69,7 +70,7 @@
 "    SemanticSettings, \n",
 "    VectorSearch, \n",
 "    HnswVectorSearchAlgorithmConfiguration, \n",
-") \n"
+")"
 ]
 },
 {
@@ -394,26 +395,19 @@
 "# Convert the DataFrame to a list of dictionaries \n",
 "documents = article_df.to_dict(orient='records') \n",
 " \n",
-"search_client = SearchClient(endpoint=search_service_endpoint, index_name=index_name, credential=credential) \n",
+"# Use SearchIndexingBufferedSender to upload the documents in batches optimized for indexing \n",
+"with SearchIndexingBufferedSender(search_service_endpoint, index_name, AzureKeyCredential(search_service_api_key)) as batch_client: \n",
+"    # Add upload actions for all documents \n",
+"    batch_client.upload_documents(documents=documents) \n",
 " \n",
-"# Define the batch upload size \n",
-"batch_size = 250 \n",
-" \n",
-"# Split the documents into batches \n",
-"batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)] \n",
-" \n",
-"# Upload each batch of documents \n",
-"for batch in batches: \n",
-"    result = search_client.upload_documents(batch) \n",
-" \n",
-"print(f\"Uploaded {len(documents)} documents in total\") \n"
+"print(f\"Uploaded {len(documents)} documents in total\") "
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"If you dataset didn't already contain pre-computed embeddings, you can create embeddings by using the below function using the `openai` python library. You'll also notice the same function and model are being used to generate query embeddings for performing vector searches."
+"If your dataset didn't already contain pre-computed embeddings, you can create embeddings by using the below function using the `openai` python library. You'll also notice the same function and model are being used to generate query embeddings for performing vector searches."
 ]
 },
 {
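
For context, the replacement cell boils down to the short Python path below. This is a minimal standalone sketch, not part of the diff; the endpoint, index name, API key, and sample documents are placeholder values standing in for variables defined earlier in the notebook.

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchIndexingBufferedSender

# Placeholder values standing in for variables defined earlier in the notebook.
search_service_endpoint = "https://<service-name>.search.windows.net"
index_name = "<index-name>"
search_service_api_key = "<api-key>"
documents = [{"id": "1", "title": "example"}]

# The buffered sender queues indexing actions and flushes them to the service
# in automatically sized batches; exiting the `with` block flushes anything
# still queued, so no manual batch-size/loop bookkeeping is needed.
with SearchIndexingBufferedSender(
    search_service_endpoint,
    index_name,
    AzureKeyCredential(search_service_api_key),
) as batch_client:
    batch_client.upload_documents(documents=documents)

print(f"Uploaded {len(documents)} documents in total")

This is what replaces the removed batch_size = 250 splitting loop: the sender handles batching internally, which is the "optimized batch indexing" the commit message refers to.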