diff --git a/examples/vector_databases/qdrant/Using_Qdrant_for_embeddings_search.ipynb b/examples/vector_databases/qdrant/Using_Qdrant_for_embeddings_search.ipynb index dbb214e..c2025f4 100644 --- a/examples/vector_databases/qdrant/Using_Qdrant_for_embeddings_search.ipynb +++ b/examples/vector_databases/qdrant/Using_Qdrant_for_embeddings_search.ipynb @@ -44,67 +44,32 @@ "cell_type": "code", "execution_count": 1, "id": "8d8810f9", - "metadata": { - "ExecuteTime": { - "end_time": "2023-06-29T12:59:21.344233180Z", - "start_time": "2023-06-29T12:59:00.815088712Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting qdrant-client\r\n", - " ...\r\n", - "Successfully installed certifi-2023.5.7 grpcio-1.56.0 grpcio-tools-1.56.0 h11-0.14.0 h2-4.1.0 hpack-4.0.0 httpcore-0.17.2 httpx-0.24.1 hyperframe-6.0.1 numpy-1.25.0 portalocker-2.7.0 protobuf-4.23.3 pydantic-1.10.9 qdrant-client-1.3.1 typing-extensions-4.5.0 urllib3-1.26.16\r\n", - "Collecting wget\r\n", - " Using cached wget-3.2.zip (10 kB)\r\n", - " Preparing metadata (setup.py) ... \u001B[?25ldone\r\n", - "\u001B[?25hBuilding wheels for collected packages: wget\r\n", - " Building wheel for wget (setup.py) ... 
\u001B[?25ldone\r\n", - "\u001B[?25h Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=eb5f15f12150fc304e7b14973424f696fa8d95225772bc0cbc0b318bf92e04b9\r\n", - " Stored in directory: /home/user/.cache/pip/wheels/04/5f/3e/46cc37c5d698415694d83f607f833f83f0149e49b3af9d0f38\r\n", - "Successfully built wget\r\n", - "Installing collected packages: wget\r\n", - "Successfully installed wget-3.2\r\n" - ] - } - ], + "metadata": {}, + "outputs": [], "source": [ "# We'll need to install Qdrant client\n", - "!pip install qdrant-client\n", - "\n", - "#Install wget to pull zip file\n", - "!pip install wget" + "!pip install qdrant-client" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "5be94df6", "metadata": { "ExecuteTime": { - "end_time": "2023-06-29T13:00:32.715638041Z", - "start_time": "2023-06-29T13:00:31.654032435Z" + "end_time": "2024-05-21T23:49:06.926613Z", + "start_time": "2024-05-21T23:49:06.923221Z" } }, "outputs": [], "source": [ "import openai\n", - "\n", - "from typing import List, Iterator\n", "import pandas as pd\n", - "import numpy as np\n", - "import os\n", - "import wget\n", "from ast import literal_eval\n", + "import qdrant_client # Qdrant's client library for Python\n", "\n", - "# Qdrant's client library for Python\n", - "import qdrant_client\n", - "\n", - "# I've set this to our new embeddings model, this can be changed to the embedding model of your choice\n", - "EMBEDDING_MODEL = \"text-embedding-3-small\"\n", + "# This can be changed to the embedding model of your choice. 
Make sure it's the same model that was used to generate the embeddings\n", + "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n", "\n", "# Ignore unclosed SSL socket warnings - optional in case you get these errors\n", "import warnings\n", @@ -125,41 +90,34 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "5dff8b55", "metadata": { "ExecuteTime": { - "end_time": "2023-06-29T13:02:47.656128622Z", - "start_time": "2023-06-29T13:00:39.079229873Z" + "end_time": "2024-05-21T23:49:54.889503Z", + "start_time": "2024-05-21T23:49:41.132888Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "'vector_database_wikipedia_articles_embedded.zip'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ + "import requests\n", + "\n", "embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'\n", "\n", "# The file is ~700 MB so this will take some time\n", - "wget.download(embeddings_url)" + "response = requests.get(embeddings_url, verify=True) # Set verify=False to bypass SSL verification\n", + "with open('vector_database_wikipedia_articles_embedded.zip', 'wb') as file:\n", + " file.write(response.content)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "id": "21097972", "metadata": { "ExecuteTime": { - "end_time": "2023-06-29T13:03:08.268413005Z", - "start_time": "2023-06-29T13:02:47.626254476Z" + "end_time": "2024-05-21T23:50:56.268540Z", + "start_time": "2024-05-21T23:50:53.171125Z" } }, "outputs": [], @@ -171,12 +129,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "70bbd8ba", "metadata": { "ExecuteTime": { - "end_time": "2023-06-29T13:03:28.291797292Z", - "start_time": "2023-06-29T13:03:08.269033964Z" + "end_time": "2024-05-21T23:51:08.388674Z", + "start_time": "2024-05-21T23:50:57.592940Z" } }, "outputs": [], @@ -188,7 +146,12 @@ "cell_type": "code", 
"execution_count": 6, "id": "1721e45d", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-21T23:51:13.706819Z", + "start_time": "2024-05-21T23:51:13.700231Z" + } + }, "outputs": [ { "data": { @@ -305,7 +268,7 @@ "4 [0.021524671465158463, 0.018522677943110466, -... 4 " ] }, - "execution_count": 6, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } @@ -318,7 +281,12 @@ "cell_type": "code", "execution_count": 7, "id": "960b82af", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-21T23:55:20.588010Z", + "start_time": "2024-05-21T23:51:16.274336Z" + } + }, "outputs": [], "source": [ "# Read vectors from strings back into a list\n", @@ -333,7 +301,12 @@ "cell_type": "code", "execution_count": 8, "id": "a334ab8b", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-21T23:55:36.075327Z", + "start_time": "2024-05-21T23:55:36.038710Z" + } + }, "outputs": [ { "name": "stdout", @@ -395,13 +368,13 @@ "id": "76d697e9", "metadata": { "ExecuteTime": { - "end_time": "2023-01-18T09:28:38.928205Z", - "start_time": "2023-01-18T09:28:38.913987Z" + "end_time": "2024-05-21T23:55:56.550765Z", + "start_time": "2024-05-21T23:55:56.517724Z" } }, "outputs": [], "source": [ - "qdrant = qdrant_client.QdrantClient(host='localhost', prefer_grpc=True)" + "qdrant = qdrant_client.QdrantClient(host=\"localhost\", port=6333)" ] }, { @@ -410,18 +383,18 @@ "id": "1deeb539", "metadata": { "ExecuteTime": { - "end_time": "2023-01-18T09:29:19.806639Z", - "start_time": "2023-01-18T09:29:19.727897Z" + "end_time": "2024-05-21T23:55:57.340006Z", + "start_time": "2024-05-21T23:55:57.312830Z" } }, "outputs": [ { "data": { "text/plain": [ - "CollectionsResponse(collections=[CollectionDescription(name='Routines')])" + "CollectionsResponse(collections=[])" ] }, - "execution_count": 10, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -448,8 +421,8 @@ "id": "1a84ee1d", "metadata": { 
"ExecuteTime": { - "end_time": "2023-01-18T09:29:22.530121Z", - "start_time": "2023-01-18T09:29:22.524604Z" + "end_time": "2024-05-21T23:56:04.066640Z", + "start_time": "2024-05-21T23:56:04.064878Z" } }, "outputs": [], @@ -463,8 +436,8 @@ "id": "00876f92", "metadata": { "ExecuteTime": { - "end_time": "2023-01-18T09:31:14.413334Z", - "start_time": "2023-01-18T09:31:13.619079Z" + "end_time": "2024-05-21T23:56:05.462165Z", + "start_time": "2024-05-21T23:56:05.247948Z" } }, "outputs": [ @@ -474,7 +447,49 @@ "True" ] }, - "execution_count": 12, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the vector size from the first row to set up the collection\n", + "vector_size = len(article_df['content_vector'][0])\n", + "\n", + "# Set up the collection with the vector configuration. You need to declare the vector size and distance metric for the collection. Distance metric enables vector database to index and search vectors efficiently.\n", + "qdrant.recreate_collection(\n", + " collection_name='Articles',\n", + " vectors_config={\n", + " 'title': rest.VectorParams(\n", + " distance=rest.Distance.COSINE,\n", + " size=vector_size,\n", + " ),\n", + " 'content': rest.VectorParams(\n", + " distance=rest.Distance.COSINE,\n", + " size=vector_size,\n", + " ),\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9f39a8c395554ca3", + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-21T23:56:21.577594Z", + "start_time": "2024-05-21T23:56:21.460740Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -498,52 +513,67 @@ ] }, { - "cell_type": "code", - "execution_count": 13, - "id": "f24e76ab", - "metadata": { - "ExecuteTime": { - "end_time": "2023-01-18T09:36:28.597535Z", - "start_time": "2023-01-18T09:36:24.108867Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - 
"UpdateResult(operation_id=0, status=)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "cell_type": "markdown", + "id": "e95be6e0c9af4c21", + "metadata": {}, "source": [ - "qdrant.upsert(\n", - " collection_name='Articles',\n", - " points=[\n", - " rest.PointStruct(\n", - " id=k,\n", - " vector={\n", - " 'title': v['title_vector'],\n", - " 'content': v['content_vector'],\n", - " },\n", - " payload=v.to_dict(),\n", - " )\n", - " for k, v in article_df.iterrows()\n", - " ],\n", - ")" + "In addition to the vector configuration defined under `vector`, we can also define the `payload` configuration. Payload is an optional field that allows you to store additional metadata alongside the vectors. In our case, we'll store the `id`, `title`, and `url` of the articles. As we return the title of nearest articles in the search results from payload, we can also provide the user with the URL to the article (which is part of the meta-data)." ] }, { "cell_type": "code", "execution_count": 14, + "id": "f24e76ab", + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-21T23:58:25.183855Z", + "start_time": "2024-05-21T23:56:50.664145Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Upserting articles: 100%|██████████| 25000/25000 [01:34<00:00, 264.52it/s]\n" + ] + } + ], + "source": [ + "from qdrant_client.models import PointStruct # Import the PointStruct to store the vector and payload\n", + "from tqdm import tqdm # Library to show the progress bar \n", + "\n", + "# Populate collection with vectors using tqdm to show progress\n", + "for k, v in tqdm(article_df.iterrows(), desc=\"Upserting articles\", total=len(article_df)):\n", + " try:\n", + " qdrant.upsert(\n", + " collection_name='Articles',\n", + " points=[\n", + " PointStruct(\n", + " id=k,\n", + " vector={'title': v['title_vector'], \n", + " 'content': v['content_vector']},\n", + " payload={\n", + " 'id': v['id'],\n", + " 'title': 
v['title'],\n", + " 'url': v['url']\n", + " }\n", + " )\n", + " ]\n", + " )\n", + " except Exception as e:\n", + " print(f\"Failed to upsert row {k}: {v}\")\n", + " print(f\"Exception: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "id": "d1188a12", "metadata": { "ExecuteTime": { - "end_time": "2023-01-18T09:58:13.825886Z", - "start_time": "2023-01-18T09:58:13.816248Z" + "end_time": "2024-05-21T23:58:27.558407Z", + "start_time": "2024-05-21T23:58:27.549740Z" } }, "outputs": [ @@ -553,7 +583,7 @@ "CountResult(count=25000)" ] }, - "execution_count": 14, + "execution_count": 101, "metadata": {}, "output_type": "execute_result" } @@ -575,12 +605,12 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "f1bac4ef", "metadata": { "ExecuteTime": { - "end_time": "2023-01-18T09:50:35.265647Z", - "start_time": "2023-01-18T09:50:35.256065Z" + "end_time": "2024-05-21T23:58:35.492725Z", + "start_time": "2024-05-21T23:58:35.488963Z" } }, "outputs": [], @@ -588,17 +618,18 @@ "def query_qdrant(query, collection_name, vector_name='title', top_k=20):\n", "\n", " # Creates embedding vector from user query\n", - " embedded_query = openai.Embedding.create(\n", + " embedded_query = openai.embeddings.create(\n", " input=query,\n", " model=EMBEDDING_MODEL,\n", - " )['data'][0]['embedding']\n", + " ).data[0].embedding # We take the first embedding from the list\n", " \n", " query_results = qdrant.search(\n", " collection_name=collection_name,\n", " query_vector=(\n", " vector_name, embedded_query\n", " ),\n", - " limit=top_k,\n", + " limit=top_k, \n", + " query_filter=None\n", " )\n", " \n", " return query_results" @@ -606,12 +637,12 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "aa92f3d3", "metadata": { "ExecuteTime": { - "end_time": "2023-01-18T09:50:46.545145Z", - "start_time": "2023-01-18T09:50:35.711020Z" + "end_time": "2024-05-21T23:58:37.183718Z", + "start_time": "2024-05-21T23:58:36.949491Z" 
} }, "outputs": [ @@ -619,43 +650,43 @@ "name": "stdout", "output_type": "stream", "text": [ - "1. Museum of Modern Art (Score: 0.875)\n", - "2. Western Europe (Score: 0.867)\n", - "3. Renaissance art (Score: 0.864)\n", - "4. Pop art (Score: 0.86)\n", - "5. Northern Europe (Score: 0.855)\n", - "6. Hellenistic art (Score: 0.853)\n", - "7. Modernist literature (Score: 0.847)\n", - "8. Art film (Score: 0.843)\n", - "9. Central Europe (Score: 0.843)\n", - "10. European (Score: 0.841)\n", - "11. Art (Score: 0.841)\n", - "12. Byzantine art (Score: 0.841)\n", - "13. Postmodernism (Score: 0.84)\n", - "14. Eastern Europe (Score: 0.839)\n", - "15. Europe (Score: 0.839)\n", - "16. Cubism (Score: 0.839)\n", - "17. Impressionism (Score: 0.838)\n", - "18. Bauhaus (Score: 0.838)\n", - "19. Expressionism (Score: 0.837)\n", - "20. Surrealism (Score: 0.837)\n" + "1. Museum of Modern Art, URL: https://simple.wikipedia.org/wiki/Museum%20of%20Modern%20Art (Score: 0.875)\n", + "2. Western Europe, URL: https://simple.wikipedia.org/wiki/Western%20Europe (Score: 0.867)\n", + "3. Renaissance art, URL: https://simple.wikipedia.org/wiki/Renaissance%20art (Score: 0.864)\n", + "4. Pop art, URL: https://simple.wikipedia.org/wiki/Pop%20art (Score: 0.86)\n", + "5. Northern Europe, URL: https://simple.wikipedia.org/wiki/Northern%20Europe (Score: 0.855)\n", + "6. Hellenistic art, URL: https://simple.wikipedia.org/wiki/Hellenistic%20art (Score: 0.853)\n", + "7. Modernist literature, URL: https://simple.wikipedia.org/wiki/Modernist%20literature (Score: 0.847)\n", + "8. Art film, URL: https://simple.wikipedia.org/wiki/Art%20film (Score: 0.843)\n", + "9. Central Europe, URL: https://simple.wikipedia.org/wiki/Central%20Europe (Score: 0.842)\n", + "10. European, URL: https://simple.wikipedia.org/wiki/European (Score: 0.841)\n", + "11. Art, URL: https://simple.wikipedia.org/wiki/Art (Score: 0.841)\n", + "12. Byzantine art, URL: https://simple.wikipedia.org/wiki/Byzantine%20art (Score: 0.841)\n", + "13. 
Postmodernism, URL: https://simple.wikipedia.org/wiki/Postmodernism (Score: 0.84)\n", + "14. Eastern Europe, URL: https://simple.wikipedia.org/wiki/Eastern%20Europe (Score: 0.839)\n", + "15. Cubism, URL: https://simple.wikipedia.org/wiki/Cubism (Score: 0.839)\n", + "16. Europe, URL: https://simple.wikipedia.org/wiki/Europe (Score: 0.839)\n", + "17. Impressionism, URL: https://simple.wikipedia.org/wiki/Impressionism (Score: 0.838)\n", + "18. Bauhaus, URL: https://simple.wikipedia.org/wiki/Bauhaus (Score: 0.838)\n", + "19. Surrealism, URL: https://simple.wikipedia.org/wiki/Surrealism (Score: 0.837)\n", + "20. Expressionism, URL: https://simple.wikipedia.org/wiki/Expressionism (Score: 0.837)\n" ] } ], "source": [ - "query_results = query_qdrant('modern art in Europe', 'Articles')\n", + "query_results = query_qdrant('modern art in Europe', 'Articles', 'title')\n", "for i, article in enumerate(query_results):\n", - " print(f'{i + 1}. {article.payload[\"title\"]} (Score: {round(article.score, 3)})')" + " print(f'{i + 1}. {article.payload[\"title\"]}, URL: {article.payload[\"url\"]} (Score: {round(article.score, 3)})')" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "7ed116b8", "metadata": { "ExecuteTime": { - "end_time": "2023-01-18T09:53:11.038910Z", - "start_time": "2023-01-18T09:52:55.248029Z" + "end_time": "2024-05-21T23:58:53.144123Z", + "start_time": "2024-05-21T23:58:52.924091Z" } }, "outputs": [ @@ -663,26 +694,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "1. Battle of Bannockburn (Score: 0.869)\n", - "2. Wars of Scottish Independence (Score: 0.861)\n", - "3. 1651 (Score: 0.853)\n", - "4. First War of Scottish Independence (Score: 0.85)\n", - "5. Robert I of Scotland (Score: 0.846)\n", - "6. 841 (Score: 0.844)\n", - "7. 1716 (Score: 0.844)\n", - "8. 1314 (Score: 0.837)\n", - "9. 1263 (Score: 0.836)\n", - "10. William Wallace (Score: 0.835)\n", - "11. Stirling (Score: 0.831)\n", - "12. 
1306 (Score: 0.831)\n", - "13. 1746 (Score: 0.831)\n", - "14. 1040s (Score: 0.828)\n", - "15. 1106 (Score: 0.827)\n", - "16. 1304 (Score: 0.827)\n", - "17. David II of Scotland (Score: 0.825)\n", - "18. Braveheart (Score: 0.824)\n", - "19. 1124 (Score: 0.824)\n", - "20. July 27 (Score: 0.823)\n" + "1. Battle of Bannockburn, URL: https://simple.wikipedia.org/wiki/Battle%20of%20Bannockburn (Score: 0.869)\n", + "2. Wars of Scottish Independence, URL: https://simple.wikipedia.org/wiki/Wars%20of%20Scottish%20Independence (Score: 0.861)\n", + "3. 1651, URL: https://simple.wikipedia.org/wiki/1651 (Score: 0.852)\n", + "4. First War of Scottish Independence, URL: https://simple.wikipedia.org/wiki/First%20War%20of%20Scottish%20Independence (Score: 0.85)\n", + "5. Robert I of Scotland, URL: https://simple.wikipedia.org/wiki/Robert%20I%20of%20Scotland (Score: 0.846)\n", + "6. 841, URL: https://simple.wikipedia.org/wiki/841 (Score: 0.844)\n", + "7. 1716, URL: https://simple.wikipedia.org/wiki/1716 (Score: 0.844)\n", + "8. 1314, URL: https://simple.wikipedia.org/wiki/1314 (Score: 0.837)\n", + "9. 1263, URL: https://simple.wikipedia.org/wiki/1263 (Score: 0.836)\n", + "10. William Wallace, URL: https://simple.wikipedia.org/wiki/William%20Wallace (Score: 0.835)\n", + "11. Stirling, URL: https://simple.wikipedia.org/wiki/Stirling (Score: 0.831)\n", + "12. 1306, URL: https://simple.wikipedia.org/wiki/1306 (Score: 0.831)\n", + "13. 1746, URL: https://simple.wikipedia.org/wiki/1746 (Score: 0.83)\n", + "14. 1040s, URL: https://simple.wikipedia.org/wiki/1040s (Score: 0.828)\n", + "15. 1106, URL: https://simple.wikipedia.org/wiki/1106 (Score: 0.827)\n", + "16. 1304, URL: https://simple.wikipedia.org/wiki/1304 (Score: 0.826)\n", + "17. David II of Scotland, URL: https://simple.wikipedia.org/wiki/David%20II%20of%20Scotland (Score: 0.825)\n", + "18. Braveheart, URL: https://simple.wikipedia.org/wiki/Braveheart (Score: 0.824)\n", + "19. 
1124, URL: https://simple.wikipedia.org/wiki/1124 (Score: 0.824)\n", + "20. July 27, URL: https://simple.wikipedia.org/wiki/July%2027 (Score: 0.823)\n" ] } ], @@ -690,13 +721,13 @@ "# This time we'll query using content vector\n", "query_results = query_qdrant('Famous battles in Scottish history', 'Articles', 'content')\n", "for i, article in enumerate(query_results):\n", - " print(f'{i + 1}. {article.payload[\"title\"]} (Score: {round(article.score, 3)})')" + " print(f'{i + 1}. {article.payload[\"title\"]}, URL: {article.payload[\"url\"]} (Score: {round(article.score, 3)})')" ] }, { "cell_type": "code", - "execution_count": null, - "id": "0119d87a", + "execution_count": 19, + "id": "cd4f750dc6daa2e8", "metadata": {}, "outputs": [], "source": []