Mirror of https://github.com/james-m-jordan/openai-cookbook.git, synced 2025-05-09 19:32:38 +00:00
Add in web crawl Q&A tutorial
This commit is contained in:
parent 420c818ba1
commit b87c10dbf7
80  solutions/web_crawl_Q&A/requirements.txt  Normal file
@@ -0,0 +1,80 @@
aiohttp==3.8.3
aiosignal==1.3.1
appnope==0.1.3
asttokens==2.2.1
async-timeout==4.0.2
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.11.1
blobfile==2.0.1
bs4==0.0.1
certifi==2022.12.7
charset-normalizer==2.1.1
comm==0.1.2
contourpy==1.0.7
cycler==0.11.0
debugpy==1.6.5
decorator==5.1.1
docopt==0.6.2
entrypoints==0.4
executing==1.2.0
filelock==3.9.0
fonttools==4.38.0
frozenlist==1.3.3
html==1.13
huggingface-hub==0.11.1
idna==3.4
ipykernel==6.20.1
ipython==8.8.0
jedi==0.18.2
joblib==1.2.0
jupyter_client==7.4.8
jupyter_core==5.1.3
kiwisolver==1.4.4
lxml==4.9.2
matplotlib==3.6.3
matplotlib-inline==0.1.6
multidict==6.0.4
nest-asyncio==1.5.6
numpy==1.24.1
openai==0.26.1
packaging==23.0
pandas==1.5.2
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.4.0
pipreqs==0.4.11
platformdirs==2.6.2
plotly==5.12.0
prompt-toolkit==3.0.36
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pycryptodomex==3.17
Pygments==2.14.0
pyparsing==3.0.9
python-dateutil==2.8.2
pytz==2022.7.1
PyYAML==6.0
pyzmq==24.0.1
regex==2022.10.31
requests==2.28.1
scikit-learn==1.2.0
scipy==1.10.0
six==1.16.0
soupsieve==2.3.2.post1
stack-data==0.6.2
tenacity==8.1.0
threadpoolctl==3.1.0
tiktoken==0.1.2
tokenizers==0.13.2
tornado==6.2
tqdm==4.64.1
traitlets==5.8.1
transformers==4.25.1
typing_extensions==4.4.0
urllib3==1.26.13
wcwidth==0.2.5
yarg==0.1.9
yarl==1.8.2
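
These pinned versions can be installed with pip install -r solutions/web_crawl_Q&A/requirements.txt before running the notebook or script below (a Python 3 virtual environment is assumed).
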
1285  solutions/web_crawl_Q&A/web-qa.ipynb  Normal file
File diff suppressed because one or more lines are too long
382  solutions/web_crawl_Q&A/web-qa.py  Normal file
@@ -0,0 +1,382 @@
################################################################################
### Step 1
################################################################################

import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd
import numpy as np
import tiktoken
import openai
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Define root domain to crawl
domain = "openai.com"
full_url = "https://openai.com/"

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])

################################################################################
### Step 2
################################################################################

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):

    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get('Content-Type').startswith("text/html"):
                return []

            # Decode the HTML
            html = response.read().decode('utf-8')
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

################################################################################
### Step 3
################################################################################

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))

################################################################################
### Step 4
################################################################################

def crawl(url):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create a directory to store the text files
    if not os.path.exists("text/"):
        os.mkdir("text/")

    if not os.path.exists("text/"+local_domain+"/"):
        os.mkdir("text/" + local_domain + "/")

    # Create a directory to store the csv files
    if not os.path.exists("processed"):
        os.mkdir("processed")

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Save text from the url to a <url>.txt file
        with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w") as f:

            # Get the text from the URL using BeautifulSoup
            soup = BeautifulSoup(requests.get(url).text, "html.parser")

            # Get the text but remove the tags
            text = soup.get_text()

            # If the page requires JavaScript to render, flag it (the raw text is still saved below)
            if ("You need to enable JavaScript to run this app." in text):
                print("Unable to parse page " + url + " due to JavaScript being required")

            # Write the text to the file in the text directory
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

crawl(full_url)

################################################################################
### Step 5
################################################################################

def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie

################################################################################
### Step 6
################################################################################

# Create a list to store the text files
texts=[]

# Get all the text files in the text directory
for file in os.listdir("text/" + domain + "/"):

    # Open the file and read the text
    with open("text/" + domain + "/" + file, "r") as f:
        text = f.read()

        # Omit the first 11 characters of the filename (the domain prefix) and the last 4 (".txt"), replace - and _ with spaces, and drop #update
        texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns = ['fname', 'text'])

# Set the text column to be the raw text with the newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)
df.to_csv('processed/scraped.csv')
df.head()

################################################################################
### Step 7
################################################################################

# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Tokenize the text and save the number of tokens to a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

# Visualize the distribution of the number of tokens per row using a histogram
df.n_tokens.hist()

################################################################################
### Step 8
################################################################################

max_tokens = 500

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    return chunks


shortened = []

# Loop through the dataframe
for row in df.iterrows():

    # If the text is None, go to the next row
    if row[1]['text'] is None:
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row[1]['n_tokens'] > max_tokens:
        shortened += split_into_many(row[1]['text'])

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append( row[1]['text'] )

################################################################################
### Step 9
################################################################################

df = pd.DataFrame(shortened, columns = ['text'])
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
df.n_tokens.hist()

################################################################################
### Step 10
################################################################################

df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
df.to_csv('processed/embeddings.csv')
df.head()

################################################################################
### Step 11
################################################################################

df=pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

df.head()

################################################################################
### Step 12
################################################################################

def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)


def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    # If debug, print the raw model response
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""

################################################################################
### Step 13
################################################################################

print(answer_question(df, question="What day is it?", debug=False))

print(answer_question(df, question="What is our newest embeddings model?"))
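
# A minimal usage sketch of the optional answer_question parameters defined in
# Step 12; the question string here is hypothetical.
print(answer_question(
    df,
    question="How do I join the waitlist?",  # hypothetical question
    max_tokens=80,                           # cap the length of the generated answer
    stop_sequence=["\n"],                    # stop at the first newline
))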