diff --git a/examples/vector_databases/Using_vector_databases_for_embeddings_search.ipynb b/examples/vector_databases/Using_vector_databases_for_embeddings_search.ipynb index 964cf35..23b8b0e 100644 --- a/examples/vector_databases/Using_vector_databases_for_embeddings_search.ipynb +++ b/examples/vector_databases/Using_vector_databases_for_embeddings_search.ipynb @@ -58,12 +58,15 @@ "# We'll need to install the clients for all vector databases\n", "!pip install pinecone-client\n", "!pip install weaviate-client\n", - "!pip install qdrant-client" + "!pip install qdrant-client\n", + "\n", + "#Install wget to pull zip file\n", + "!pip install wget" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "5be94df6", "metadata": {}, "outputs": [], @@ -71,14 +74,12 @@ "import openai\n", "\n", "import tiktoken\n", - "from tenacity import retry, wait_random_exponential, stop_after_attempt\n", "from typing import List, Iterator\n", - "import concurrent\n", - "from tqdm import tqdm\n", "import pandas as pd\n", - "from datasets import load_dataset\n", "import numpy as np\n", "import os\n", + "import wget\n", + "from ast import literal_eval\n", "\n", "# Pinecone's client library for Python\n", "import pinecone\n", @@ -106,287 +107,71 @@ "source": [ "## Load data\n", "\n", - "In this section we'll source the data for this task, embed it and format it for insertion into a vector database" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "bd99e08e", - "metadata": {}, - "outputs": [], - "source": [ - "# Simple function to take in a list of text objects and return them as a list of embeddings\n", - "def get_embeddings(input: List):\n", - " response = openai.Embedding.create(\n", - " input=input,\n", - " model=EMBEDDING_MODEL,\n", - " )[\"data\"]\n", - " return [data[\"embedding\"] for data in response]\n", - "\n", - "def batchify(iterable, n=1):\n", - " l = len(iterable)\n", - " for ndx in range(0, l, n):\n", - " yield iterable[ndx : 
min(ndx + n, l)]\n", - "\n", - "# Function for batching and parallel processing the embeddings\n", - "def embed_corpus(\n", - " corpus: List[str],\n", - " batch_size=64,\n", - " num_workers=8,\n", - " max_context_len=8191,\n", - "):\n", - "\n", - " # Encode the corpus, truncating to max_context_len\n", - " encoding = tiktoken.get_encoding(\"cl100k_base\")\n", - " encoded_corpus = [\n", - " encoded_article[:max_context_len] for encoded_article in encoding.encode_batch(corpus)\n", - " ]\n", - "\n", - " # Calculate corpus statistics: the number of inputs, the total number of tokens, and the estimated cost to embed\n", - " num_tokens = sum(len(article) for article in encoded_corpus)\n", - " cost_to_embed_tokens = num_tokens / 1_000 * 0.0004\n", - " print(\n", - " f\"num_articles={len(encoded_corpus)}, num_tokens={num_tokens}, est_embedding_cost={cost_to_embed_tokens:.2f} USD\"\n", - " )\n", - "\n", - " # Embed the corpus\n", - " with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:\n", - " \n", - " futures = [\n", - " executor.submit(get_embeddings, text_batch)\n", - " for text_batch in batchify(encoded_corpus, batch_size)\n", - " ]\n", - "\n", - " with tqdm(total=len(encoded_corpus)) as pbar:\n", - " for _ in concurrent.futures.as_completed(futures):\n", - " pbar.update(batch_size)\n", - "\n", - " embeddings = []\n", - " for future in futures:\n", - " data = future.result()\n", - " embeddings.extend(data)\n", - "\n", - " return embeddings" + "In this section we'll load embedded data that we've prepared prior to this session." 
] }, { "cell_type": "code", "execution_count": null, - "id": "0c1c73cb", + "id": "5dff8b55", "metadata": {}, "outputs": [], "source": [ - "# We'll use the datasets library to pull the Simple Wikipedia dataset for embedding\n", - "dataset = list(load_dataset(\"wikipedia\", \"20220301.simple\")[\"train\"])\n", - "# Limited to 25k articles for demo purposes\n", - "dataset = dataset[:25_000] " + "embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'\n", + "\n", + "# Warning, the file is pretty big so this will take some time\n", + "wget.download(embeddings_url)" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "e6ee90ce", + "execution_count": null, + "id": "21097972", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "num_articles=25000, num_tokens=12896881, est_embedding_cost=5.16 USD\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "25024it [01:06, 377.31it/s] " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 16.3 s, sys: 2.24 s, total: 18.5 s\n", - "Wall time: 1min 8s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ - "%%time\n", - "# Embed the article text\n", - "dataset_embeddings = embed_corpus([article[\"text\"] for article in dataset])" + "import zipfile\n", + "with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\",\"r\") as zip_ref:\n", + " zip_ref.extractall(\"../data\")\n", + " \n", + "article_df = pd.read_csv('../data/vector_database_wikipedia_articles_embedded.csv')" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "850c7215", + "execution_count": null, + "id": "1721e45d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "num_articles=25000, num_tokens=88300, est_embedding_cost=0.04 USD\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "25024it [00:36, 683.22it/s] \n" - ] - } - ], + "outputs": [], "source": [ - "# Embed the article titles separately\n", - "title_embeddings = embed_corpus([article[\"title\"] for article in dataset])" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "id": "1410daaa", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idurltitletexttitle_vectorcontent_vectorvector_id
01https://simple.wikipedia.org/wiki/AprilAprilApril is the fourth month of the year in the J...[0.001009464613161981, -0.020700545981526375, ...[-0.011253940872848034, -0.013491976074874401,...0
12https://simple.wikipedia.org/wiki/AugustAugustAugust (Aug.) is the eighth month of the year ...[0.0009286514250561595, 0.000820168002974242, ...[0.0003609954728744924, 0.007262262050062418, ...1
26https://simple.wikipedia.org/wiki/ArtArtArt is a creative activity that expresses imag...[0.003393713850528002, 0.0061537534929811954, ...[-0.004959689453244209, 0.015772193670272827, ...2
38https://simple.wikipedia.org/wiki/AAA or a is the first letter of the English alph...[0.0153952119871974, -0.013759135268628597, 0....[0.024894846603274345, -0.022186409682035446, ...3
49https://simple.wikipedia.org/wiki/AirAirAir refers to the Earth's atmosphere. Air is a...[0.02224554680287838, -0.02044147066771984, -0...[0.021524671465158463, 0.018522677943110466, -...4
\n", - "
" - ], - "text/plain": [ - " id url title \\\n", - "0 1 https://simple.wikipedia.org/wiki/April April \n", - "1 2 https://simple.wikipedia.org/wiki/August August \n", - "2 6 https://simple.wikipedia.org/wiki/Art Art \n", - "3 8 https://simple.wikipedia.org/wiki/A A \n", - "4 9 https://simple.wikipedia.org/wiki/Air Air \n", - "\n", - " text \\\n", - "0 April is the fourth month of the year in the J... \n", - "1 August (Aug.) is the eighth month of the year ... \n", - "2 Art is a creative activity that expresses imag... \n", - "3 A or a is the first letter of the English alph... \n", - "4 Air refers to the Earth's atmosphere. Air is a... \n", - "\n", - " title_vector \\\n", - "0 [0.001009464613161981, -0.020700545981526375, ... \n", - "1 [0.0009286514250561595, 0.000820168002974242, ... \n", - "2 [0.003393713850528002, 0.0061537534929811954, ... \n", - "3 [0.0153952119871974, -0.013759135268628597, 0.... \n", - "4 [0.02224554680287838, -0.02044147066771984, -0... \n", - "\n", - " content_vector vector_id \n", - "0 [-0.011253940872848034, -0.013491976074874401,... 0 \n", - "1 [0.0003609954728744924, 0.007262262050062418, ... 1 \n", - "2 [-0.004959689453244209, 0.015772193670272827, ... 2 \n", - "3 [0.024894846603274345, -0.022186409682035446, ... 3 \n", - "4 [0.021524671465158463, 0.018522677943110466, -... 
4 " - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# We will then store the result in another dataframe, and prep the data for insertion into a vector DB\n", - "article_df = pd.DataFrame(dataset)\n", - "article_df['title_vector'] = title_embeddings\n", - "article_df['content_vector'] = dataset_embeddings\n", - "article_df['vector_id'] = article_df.index\n", - "article_df['vector_id'] = article_df['vector_id'].apply(str)\n", "article_df.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "960b82af", + "metadata": {}, + "outputs": [], + "source": [ + "# Read vectors from strings back into a list\n", + "#article_df['title_vector'] = article_df.title_vector.apply(literal_eval)\n", + "article_df['content_vector'] = article_df.content_vector.apply(literal_eval)\n", + "\n", + "# Set vector_id to be a string\n", + "article_df['vector_id'] = article_df['vector_id'].apply(str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a334ab8b", + "metadata": {}, + "outputs": [], + "source": [ + "len(article_df['title_vector'][0])" + ] + }, { "cell_type": "markdown", "id": "ed32fc87", @@ -406,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "92e6152a", "metadata": {}, "outputs": [], @@ -429,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": null, "id": "0a71c575", "metadata": {}, "outputs": [], @@ -461,21 +246,10 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": null, "id": "7ea9ad46", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['wikipedia-articles']" - ] - }, - "execution_count": 124, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Pick a name for the new index\n", "index_name = 'wikipedia-articles'\n", @@ -494,18 +268,10 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 
null, "id": "5daeba00", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Uploading vectors to content namespace..\n" - ] - } - ], + "outputs": [], "source": [ "# Upsert content vectors in content namespace - this can take a few minutes\n", "print(\"Uploading vectors to content namespace..\")\n", @@ -515,18 +281,10 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": null, "id": "5fc1b083", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Uploading vectors to title namespace..\n" - ] - } - ], + "outputs": [], "source": [ "# Upsert title vectors in title namespace - this can also take a few minutes\n", "print(\"Uploading vectors to title namespace..\")\n", @@ -536,25 +294,10 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": null, "id": "f90c7fba", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'dimension': 1536,\n", - " 'index_fullness': 0.1,\n", - " 'namespaces': {'content': {'vector_count': 25000},\n", - " 'title': {'vector_count': 25000}},\n", - " 'total_vector_count': 50000}" - ] - }, - "execution_count": 128, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Check index size for each namespace to confirm all of our docs have loaded\n", "index.describe_index_stats()" @@ -584,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": null, "id": "3c8c2aa1", "metadata": {}, "outputs": [], @@ -681,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": null, "id": "b9ea472d", "metadata": {}, "outputs": [], @@ -691,21 +434,10 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": null, "id": "13be220d", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'classes': []}" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], + 
"outputs": [], "source": [ "client.schema.delete_all()\n", "client.schema.get()" @@ -713,21 +445,10 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": null, "id": "73d33184", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 115, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "client.is_ready()" ] @@ -748,53 +469,10 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": null, "id": "e868d143", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'classes': [{'class': 'Article',\n", - " 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},\n", - " 'cleanupIntervalSeconds': 60,\n", - " 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},\n", - " 'properties': [{'dataType': ['text'],\n", - " 'description': 'Title of the article',\n", - " 'name': 'title',\n", - " 'tokenization': 'word'},\n", - " {'dataType': ['text'],\n", - " 'description': 'Contents of the article',\n", - " 'name': 'content',\n", - " 'tokenization': 'word'}],\n", - " 'shardingConfig': {'virtualPerPhysical': 128,\n", - " 'desiredCount': 1,\n", - " 'actualCount': 1,\n", - " 'desiredVirtualCount': 128,\n", - " 'actualVirtualCount': 128,\n", - " 'key': '_id',\n", - " 'strategy': 'hash',\n", - " 'function': 'murmur3'},\n", - " 'vectorIndexConfig': {'skip': False,\n", - " 'cleanupIntervalSeconds': 300,\n", - " 'maxConnections': 64,\n", - " 'efConstruction': 128,\n", - " 'ef': -1,\n", - " 'dynamicEfMin': 100,\n", - " 'dynamicEfMax': 500,\n", - " 'dynamicEfFactor': 8,\n", - " 'vectorCacheMaxObjects': 2000000,\n", - " 'flatSearchCutoff': 40000,\n", - " 'distance': 'cosine'},\n", - " 'vectorIndexType': 'hnsw',\n", - " 'vectorizer': 'none'}]}" - ] - }, - "execution_count": 116, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "class_obj = {\n", " \"class\": \"Article\",\n", @@ -820,18 +498,10 @@ }, 
{ "cell_type": "code", - "execution_count": 117, + "execution_count": null, "id": "786d437f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Uploading vectors to article schema..\n" - ] - } - ], + "outputs": [], "source": [ "# Convert DF into a list of tuples\n", "data_objects = []\n", @@ -861,49 +531,10 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": null, "id": "3658693c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Kim Jong-nam\n", - "Kim Jong-nam (May 10, 1971 - February 13, 2017) was the eldest son of Kim Jong-il, the former leader of North Korea.\n", - "\n", - "He tried to enter Japan using a fake passport in May 2001. This was to visit Disneyland. This caused his father to not approve of him. Kim Jong-nam's younger half-brother Kim Jong-un was made the heir in September 2010.\n", - "\n", - "In June 2010, Kim Jong-nam gave a brief interview to the Associated Press in Macau. He told the reporter that he had \"no plans\" to defect to Europe. The press had recently said this. Kim Jong-nam lived in an apartment on the southern tip of Macau's Coloane Island until 2007. An anonymous South Korean official reported in October 2010 that Jong-nam had not lived in Macau for \"months\", and now goes between China and \"another country.\"\n", - "\n", - "When his father died, Kim Jong-nam did not attend the funeral. This was to avoid rumours on the succession.\n", - "\n", - "He was assassinated in Malaysia on February 13, 2017, which is believed to be ordered by his half-brother Kim Jong-un.\n", - "\n", - "Personal life\n", - "The South Korean newspaper The Chosun Ilbo said that Kim Jong-nam has two wives, at least one mistress, and several children. His first wife Shin Jong-hui (born c. 1980) and their son Kum-sol (born c. 1996) live at a home called Dragon Villa on the northern outskirts of Beijing. His second wife Lee Hye-kyong (born c. 
1970), their son Han-sol (born c. 1995) and their daughter Sol-hui (born c. 1998) live in an apartment building in Macau. Jong-nam's mistress, former Air Koryo flight attendant So Yong-la (born c. 1980), also lives in Macau. \n", - "\n", - "Jong-nam is often given attention by the media for his gambling and extravagant spending.\n", - "\n", - "References\n", - "\n", - "1971 births\n", - "2017 deaths\n", - "Assassinated people\n", - "North Korean politicians\n" - ] - }, - { - "data": { - "text/plain": [ - "{'Aggregate': {'Article': [{'meta': {'count': 25000}}]}}" - ] - }, - "execution_count": 118, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Test our insert has worked by checking one object\n", "print(client.data_object.get()['objects'][0]['properties']['title'])\n", @@ -928,7 +559,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": null, "id": "5acd5437", "metadata": {}, "outputs": [], @@ -954,37 +585,10 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": null, "id": "15def653", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1. Museum of Modern Art (Score: 0.938)\n", - "2. Western Europe (Score: 0.934)\n", - "3. Renaissance art (Score: 0.932)\n", - "4. Pop art (Score: 0.93)\n", - "5. Northern Europe (Score: 0.927)\n", - "6. Hellenistic art (Score: 0.926)\n", - "7. Modernist literature (Score: 0.924)\n", - "8. Art film (Score: 0.922)\n", - "9. Central Europe (Score: 0.921)\n", - "10. Art (Score: 0.921)\n", - "11. European (Score: 0.921)\n", - "12. Byzantine art (Score: 0.92)\n", - "13. Postmodernism (Score: 0.92)\n", - "14. Eastern Europe (Score: 0.92)\n", - "15. Cubism (Score: 0.92)\n", - "16. Europe (Score: 0.919)\n", - "17. Impressionism (Score: 0.919)\n", - "18. Bauhaus (Score: 0.919)\n", - "19. Surrealism (Score: 0.919)\n", - "20. 
Expressionism (Score: 0.919)\n" - ] - } - ], + "outputs": [], "source": [ "query_result = query_weaviate('modern art in Europe','Article')\n", "counter = 0\n", @@ -995,37 +599,10 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": null, "id": "93c4a696", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1. Historic Scotland (Score: 0.946)\n", - "2. First War of Scottish Independence (Score: 0.946)\n", - "3. Battle of Bannockburn (Score: 0.946)\n", - "4. Wars of Scottish Independence (Score: 0.944)\n", - "5. Second War of Scottish Independence (Score: 0.939)\n", - "6. List of Scottish monarchs (Score: 0.937)\n", - "7. Scottish Borders (Score: 0.932)\n", - "8. Braveheart (Score: 0.929)\n", - "9. John of Scotland (Score: 0.929)\n", - "10. Guardians of Scotland (Score: 0.926)\n", - "11. Holyrood Abbey (Score: 0.925)\n", - "12. Scottish (Score: 0.925)\n", - "13. Scots (Score: 0.925)\n", - "14. Robert I of Scotland (Score: 0.924)\n", - "15. Scottish people (Score: 0.924)\n", - "16. Alexander I of Scotland (Score: 0.924)\n", - "17. Edinburgh Castle (Score: 0.924)\n", - "18. Robert Burns (Score: 0.923)\n", - "19. Battle of Bosworth Field (Score: 0.922)\n", - "20. 
David II of Scotland (Score: 0.922)\n" - ] - } - ], + "outputs": [], "source": [ "query_result = query_weaviate('Famous battles in Scottish history','Article')\n", "counter = 0\n", @@ -1063,7 +640,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": null, "id": "76d697e9", "metadata": { "ExecuteTime": { @@ -1078,7 +655,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": null, "id": "1deeb539", "metadata": { "ExecuteTime": { @@ -1086,18 +663,7 @@ "start_time": "2023-01-18T09:29:19.727897Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "CollectionsResponse(collections=[])" - ] - }, - "execution_count": 100, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "qdrant.get_collections()" ] @@ -1116,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": null, "id": "1a84ee1d", "metadata": { "ExecuteTime": { @@ -1131,7 +697,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": null, "id": "00876f92", "metadata": { "ExecuteTime": {