Initial commit of simple app demonstrating chatbot starting point in notebook and Streamlit

parent c3b920f123
commit 771f108f77

apps/chatbot-kickstarter/README.md (new file, 35 lines)
@@ -0,0 +1,35 @@
# Powering your products with ChatGPT and your own data

The Chatbot Kickstarter is a starter repo to get you used to building a basic Chatbot using the ChatGPT API and your own knowledge base. The flow you're taken through was originally presented with [these slides](https://drive.google.com/file/d/1dB-RQhZC_Q1iAsHkNNdkqtxxXqYODFYy/view?usp=share_link), which may be useful to refer back to.

This repo contains one notebook and two basic Streamlit apps:

- `powering_your_products_with_chatgpt_and_your_data.ipynb`: A notebook containing a step-by-step process of tokenising, chunking and embedding your data in a vector database, and building simple Q&A and Chatbot functionality on top.
- `search.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
- `chat.py`: A Streamlit app providing a simple Chatbot via a search bar to query your knowledge base.

To run either version of the app, please follow the instructions in the respective README.md files in the subdirectories.
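If those per-app instructions aren't available in your copy, a typical way to get going (an assumption, not verified against this commit) is: install the dependencies with `pip install -r requirements.txt` from `apps/chatbot-kickstarter`, start a local Redis instance, set your `OPENAI_API_KEY`, run the notebook once to build the index, then launch the apps with `streamlit run search.py` or `streamlit run chat.py`.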

## How it works

The notebook is the best place to start, and is broadly laid out as follows:

- **Lay the foundations:**
    - Set up the vector database to accept vectors and data
    - Load the dataset, chunk the data up for embedding and store in the vector database
- **Make it a product:**
    - Add a retrieval step where users provide queries and we return the most relevant entries
    - Summarise search results with GPT-3 (a minimal sketch of this step is shown below)
    - Test out this basic Q&A app in Streamlit
- **Build your moat:**
    - Create an Assistant class to manage context and interact with our bot
    - Use the Chatbot to answer questions using semantic search context
    - Test out this basic Chatbot app in Streamlit
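
The snippet below is a minimal sketch of the retrieval-and-summarisation step described above, reusing the helpers that ship in this repo (`get_redis_connection` and `get_redis_results` from `database.py`, plus the constants in `config.py`). It assumes a local Redis instance that the notebook has already populated and an `OPENAI_API_KEY` in your environment; the query string is just an illustrative example.

```python
import openai

from database import get_redis_connection, get_redis_results
from config import COMPLETIONS_MODEL, INDEX_NAME

redis_client = get_redis_connection()

query = "what is the cost cap for a power unit in 2023"

# Retrieve the most relevant chunks for the query from the vector index
results = get_redis_results(redis_client, query, INDEX_NAME)

# Ask the Completions model to answer the query using the top search result
summary = openai.Completion.create(
    engine=COMPLETIONS_MODEL,
    prompt=f"Summarise this result to answer the search query.\nSearch query: {query}\nSearch result: {results['result'][0]}\nSummary:",
    max_tokens=500,
)
print(summary["choices"][0]["text"])
```

`search.py` wraps this same pattern in a Streamlit page, and `chat.py` layers the `RetrievalAssistant` conversation flow on top.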

Once you've run the notebook and tried the two Streamlit apps, you should be in a position to strip out any useful snippets and start your own Q&A or Chat application.

## Limitations

- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your need.
- This is a simple starting point - if you hit issues deploying your use case you may need to tune (non-exhaustive list):
    - The prompt and parameters for the model so that it answers accurately
    - Your search, so that it returns more relevant results
    - Your chunking/embedding approach, so that the most relevant content is stored effectively for retrieval
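
For example, to check whether retrieval (rather than the prompt) is the weak point, one option is to widen the search and inspect the raw matches and their scores. A sketch using the `query_redis` helper from `database.py`, with an illustrative query:

```python
from database import get_redis_connection, query_redis
from config import INDEX_NAME

redis_client = get_redis_connection()

# Return more candidates than the default top_k=2 and eyeball their relevance
results = query_redis(redis_client, "2023 power unit cost cap", INDEX_NAME, top_k=5)
for doc in results.docs:
    print(doc.vector_score, doc.filename, doc.text_chunk[:80])
```

Chunking is controlled by `TEXT_EMBEDDING_CHUNK_SIZE` in `config.py`, and the chat model's behaviour by the system prompt and `temperature` in `chatbot.py`.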

apps/chatbot-kickstarter/chat.py (new file, 83 lines)
@@ -0,0 +1,83 @@
import streamlit as st
from streamlit_chat import message

from database import get_redis_connection
from chatbot import RetrievalAssistant, Message

# Initialise database

## Initialise Redis connection
redis_client = get_redis_connection()

# Set instruction

# System prompt requiring Question and Year to be extracted from the user
system_prompt = '''
You are a helpful Formula 1 knowledge base assistant. You need to capture a Question and Year from each customer.
The Question is their query on Formula 1, and the Year is the year of the applicable Formula 1 season.
Think about this step by step:
- The user will ask a Question
- You will ask them for the Year if their question didn't include a Year
- Once you have the Year, say "searching for answers".

Example:

User: I'd like to know the cost cap for a power unit

Assistant: Certainly, what year would you like this for?

User: 2023 please.

Assistant: Searching for answers.
'''

### CHATBOT APP

st.set_page_config(
    page_title="Streamlit Chat - Demo",
    page_icon=":robot:"
)

st.title('Formula 1 Chatbot')
st.subheader("Help us help you learn about Formula 1")

if 'generated' not in st.session_state:
    st.session_state['generated'] = []

if 'past' not in st.session_state:
    st.session_state['past'] = []

def query(question):
    response = st.session_state['chat'].ask_assistant(question)
    return response

prompt = st.text_input("What do you want to know: ", "", key="input")

if st.button('Submit', key='generationSubmit'):

    # Initialization
    if 'chat' not in st.session_state:
        st.session_state['chat'] = RetrievalAssistant()
        messages = []
        system_message = Message('system', system_prompt)
        messages.append(system_message.message())
    else:
        messages = []

    user_message = Message('user', prompt)
    messages.append(user_message.message())

    response = query(messages)

    # Debugging step to print the whole response
    # st.write(response)

    st.session_state.past.append(prompt)
    st.session_state.generated.append(response['content'])

if st.session_state['generated']:

    for i in range(len(st.session_state['generated'])-1, -1, -1):
        message(st.session_state["generated"][i], key=str(i))
        message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')

apps/chatbot-kickstarter/chatbot.py (new file, 84 lines)
@@ -0,0 +1,84 @@
import openai
from termcolor import colored
import streamlit as st

from database import get_redis_connection, get_redis_results

from config import CHAT_MODEL, COMPLETIONS_MODEL, INDEX_NAME

redis_client = get_redis_connection()

# A basic class to create a message as a dict for chat
class Message:

    def __init__(self, role, content):
        self.role = role
        self.content = content

    def message(self):
        return {"role": self.role, "content": self.content}

# New Assistant class to add a vector database call to its responses
class RetrievalAssistant:

    def __init__(self):
        self.conversation_history = []

    def _get_assistant_response(self, prompt):
        try:
            completion = openai.ChatCompletion.create(
                model=CHAT_MODEL,
                messages=prompt,
                temperature=0.1
            )

            response_message = Message(completion['choices'][0]['message']['role'], completion['choices'][0]['message']['content'])
            return response_message.message()

        except Exception as e:
            return f'Request failed with exception {e}'

    # The function to retrieve Redis search results
    def _get_search_results(self, prompt):
        latest_question = prompt
        search_content = get_redis_results(redis_client, latest_question, INDEX_NAME)['result'][0]
        return search_content

    def ask_assistant(self, next_user_prompt):
        [self.conversation_history.append(x) for x in next_user_prompt]
        assistant_response = self._get_assistant_response(self.conversation_history)

        # Answer normally unless the trigger phrase "searching for answers" is used
        if 'searching for answers' in assistant_response['content'].lower():
            question_extract = openai.Completion.create(model=COMPLETIONS_MODEL, prompt=f"Extract the user's latest question and the year for that question from this conversation: {self.conversation_history}. Extract it as a sentence stating the Question and Year")
            search_result = self._get_search_results(question_extract['choices'][0]['text'])

            # We insert an extra system prompt here to give fresh context to the Chatbot on how to use the Redis results
            # In this instance we add it to the conversation history, but in production it may be better to hide
            self.conversation_history.insert(-1, {"role": 'system', "content": f"Answer the user's question using this content: {search_result}. If you cannot answer the question, say 'Sorry, I don't know the answer to this one'"})

            assistant_response = self._get_assistant_response(self.conversation_history)

            self.conversation_history.append(assistant_response)
            return assistant_response
        else:
            self.conversation_history.append(assistant_response)
            return assistant_response

    def pretty_print_conversation_history(self, colorize_assistant_replies=True):
        for entry in self.conversation_history:
            if entry['role'] == 'system':
                pass
            else:
                prefix = entry['role']
                content = entry['content']
                output = colored(prefix + ':\n' + content, 'green') if colorize_assistant_replies and entry['role'] == 'assistant' else prefix + ':\n' + content
                print(output)
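
Not part of the file above: a minimal, hypothetical console usage of `Message` and `RetrievalAssistant`, assuming the Redis index exists and an OpenAI API key is configured. The system prompt wording here is illustrative; `chat.py` shows the full prompt used by the app.

```python
from chatbot import RetrievalAssistant, Message

assistant = RetrievalAssistant()
messages = [
    Message('system', 'You are a helpful Formula 1 knowledge base assistant. Once you have a Question and a Year, say "searching for answers".').message(),
    Message('user', "What was the cost cap for a power unit in 2023?").message(),
]

# Returns a {"role": ..., "content": ...} dict; triggers a Redis search when the
# assistant replies with the "searching for answers" phrase
reply = assistant.ask_assistant(messages)
print(reply['content'])

assistant.pretty_print_conversation_history()
```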

apps/chatbot-kickstarter/config.py (new file, 7 lines)
@@ -0,0 +1,7 @@
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
CHAT_MODEL = 'gpt-3.5-turbo'
TEXT_EMBEDDING_CHUNK_SIZE = 300
VECTOR_FIELD_NAME = 'content_vector'
PREFIX = "sportsdoc"
INDEX_NAME = "f1-index"

Binary files not shown (5 files).

apps/chatbot-kickstarter/database.py (new file, 82 lines)
@@ -0,0 +1,82 @@
import pandas as pd
import numpy as np
import openai
from redis import Redis
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField, NumericField
from redis.commands.search.query import Query

from config import EMBEDDINGS_MODEL, PREFIX, VECTOR_FIELD_NAME

# Get a Redis connection
def get_redis_connection(host='localhost', port='6379', db=0):
    r = Redis(host=host, port=port, db=db, decode_responses=False)
    return r

# Create a Redis index to hold our data
def create_hnsw_index(redis_conn, vector_field_name, vector_dimensions=1536, distance_metric='COSINE'):
    redis_conn.ft().create_index([
        VectorField(vector_field_name, "HNSW", {"TYPE": "FLOAT32", "DIM": vector_dimensions, "DISTANCE_METRIC": distance_metric}),
        TextField("filename"),
        TextField("text_chunk"),
        NumericField("file_chunk_index")
    ])

# Create a Redis pipeline to load all the vectors and their metadata
def load_vectors(client: Redis, input_list, vector_field_name):
    p = client.pipeline(transaction=False)
    for text in input_list:
        # hash key
        key = f"{PREFIX}:{text['id']}"

        # hash values
        item_metadata = text['metadata']
        item_keywords_vector = np.array(text['vector'], dtype='float32').tobytes()
        item_metadata[vector_field_name] = item_keywords_vector

        # HSET
        p.hset(key, mapping=item_metadata)

    p.execute()

# Make query to Redis
def query_redis(redis_conn, query, index_name, top_k=2):

    ## Creates embedding vector from user query
    embedded_query = np.array(openai.Embedding.create(
        input=query,
        model=EMBEDDINGS_MODEL,
    )["data"][0]['embedding'], dtype=np.float32).tobytes()

    # Prepare the query
    q = Query(f'*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]').sort_by('vector_score').paging(0, top_k).return_fields('vector_score', 'filename', 'text_chunk', 'text_chunk_index').dialect(2)
    params_dict = {"vec_param": embedded_query}

    # Execute the query
    results = redis_conn.ft(index_name).search(q, query_params=params_dict)

    return results

# Get mapped documents from Redis results
def get_redis_results(redis_conn, query, index_name):

    # Get most relevant documents from Redis
    query_result = query_redis(redis_conn, query, index_name)

    # Extract info into a list
    query_result_list = []
    for i, result in enumerate(query_result.docs):
        result_order = i
        text = result.text_chunk
        score = result.vector_score
        query_result_list.append((result_order, text, score))

    # Display result as a DataFrame for ease of use
    result_df = pd.DataFrame(query_result_list)
    result_df.columns = ['id', 'result', 'certainty']
    return result_df
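
An illustrative sketch (not part of `database.py`) of loading a single vector and querying it back. It assumes the `f1-index` named in `config.py` has already been created (the notebook is the source of truth for index creation) and that an OpenAI API key is set, since querying embeds the search string. The entry below uses a random stand-in vector purely for demonstration.

```python
import numpy as np
from database import get_redis_connection, load_vectors, get_redis_results
from config import VECTOR_FIELD_NAME, INDEX_NAME

redis_conn = get_redis_connection()

# Each entry needs an id, an embedding vector and metadata matching the index fields
example_entry = {
    "id": "example-doc-!0",                   # hypothetical chunk id
    "vector": np.random.rand(1536).tolist(),  # stand-in for a real ada-002 embedding
    "metadata": {"filename": "example-doc", "text_chunk": "Some example text.", "file_chunk_index": 0},
}
load_vectors(redis_conn, [example_entry], VECTOR_FIELD_NAME)

# Returns a DataFrame with columns ['id', 'result', 'certainty']
print(get_redis_results(redis_conn, "example text", INDEX_NAME))
```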
File diff suppressed because it is too large

apps/chatbot-kickstarter/requirements.txt (new file, 11 lines)
@@ -0,0 +1,11 @@
numpy==1.24.2
openai==0.27.1
pandas==1.5.3
redis==4.5.1
requests==2.28.2
streamlit==1.20.0
streamlit_chat==0.0.2.2
termcolor==2.2.0
jupyter
ipykernel
textract

apps/chatbot-kickstarter/search.py (new file, 39 lines)
@@ -0,0 +1,39 @@
import streamlit as st
import openai

from database import get_redis_connection, get_redis_results
from config import INDEX_NAME, COMPLETIONS_MODEL

# Initialise Redis connection

client = get_redis_connection()

### SEARCH APP

st.set_page_config(
    page_title="Streamlit Search - Demo",
    page_icon=":robot:"
)

st.title('Formula 1 Search')
st.subheader("Search for any Formula 1 rule questions you have")

prompt = st.text_input("Enter your search here", "", key="input")

if st.button('Submit', key='generationSubmit'):
    result_df = get_redis_results(client, prompt, INDEX_NAME)

    # Build a prompt to provide the original query, the result and ask to summarise for the user
    summary_prompt = '''Summarise this result in a bulleted list to answer the search query a customer has sent.
    Search query: SEARCH_QUERY_HERE
    Search result: SEARCH_RESULT_HERE
    Summary:
    '''
    summary_prepped = summary_prompt.replace('SEARCH_QUERY_HERE', prompt).replace('SEARCH_RESULT_HERE', result_df['result'][0])
    summary = openai.Completion.create(engine=COMPLETIONS_MODEL, prompt=summary_prepped, max_tokens=500)

    # Response provided by GPT-3
    st.write(summary['choices'][0]['text'])

    # Option to display raw table instead of summary from GPT-3
    # st.table(result_df)

apps/chatbot-kickstarter/transformers.py (new file, 116 lines)
@@ -0,0 +1,116 @@
from typing import Iterator
from numpy import array, average
import openai
import pandas as pd
import numpy as np

from config import TEXT_EMBEDDING_CHUNK_SIZE, EMBEDDINGS_MODEL
from database import load_vectors

def get_col_average_from_list_of_lists(list_of_lists):
    """Return the average of each column in a list of lists."""
    if len(list_of_lists) == 1:
        return list_of_lists[0]
    else:
        list_of_lists_array = array(list_of_lists)
        average_embedding = average(list_of_lists_array, axis=0)
        return average_embedding.tolist()

# Create embeddings for a text using a tokenizer and an OpenAI engine

def create_embeddings_for_text(text, tokenizer):
    """Return a list of tuples (text_chunk, embedding) and an average embedding for a text."""
    token_chunks = list(chunks(text, TEXT_EMBEDDING_CHUNK_SIZE, tokenizer))
    text_chunks = [tokenizer.decode(chunk) for chunk in token_chunks]

    embeddings_response = get_embeddings(text_chunks, EMBEDDINGS_MODEL)
    embeddings = [embedding["embedding"] for embedding in embeddings_response]
    text_embeddings = list(zip(text_chunks, embeddings))

    average_embedding = get_col_average_from_list_of_lists(embeddings)

    return (text_embeddings, average_embedding)

def get_embeddings(text_array, engine):
    return openai.Engine(id=engine).embeddings(input=text_array)["data"]

# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def chunks(text, n, tokenizer):
    """Yield successive n-sized chunks from text."""
    tokens = tokenizer.encode(text)
    i = 0
    while i < len(tokens):
        # Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
        j = min(i + int(1.5 * n), len(tokens))
        while j > i + int(0.5 * n):
            # Decode the tokens and check for full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("\n"):
                break
            j -= 1
        # If no end of sentence found, use n tokens as the chunk size
        if j == i + int(0.5 * n):
            j = min(i + n, len(tokens))
        yield tokens[i:j]
        i = j

def get_unique_id_for_file_chunk(filename, chunk_index):
    return str(filename + "-!" + str(chunk_index))

def handle_file_string(file, tokenizer, redis_conn, text_embedding_field, index_name):
    filename = file[0]
    file_body_string = file[1]

    # Clean up the file string by collapsing double spaces and replacing newlines and semi-colons
    clean_file_body_string = file_body_string.replace("  ", " ").replace("\n", "; ").replace(';', ' ')

    # Add the filename to the text to embed
    text_to_embed = "Filename is: {}; {}".format(
        filename, clean_file_body_string)

    # Create embeddings for the text
    try:
        text_embeddings, average_embedding = create_embeddings_for_text(
            text_to_embed, tokenizer)
        # print("[handle_file_string] Created embedding for {}".format(filename))
    except Exception as e:
        print("[handle_file_string] Error creating embedding: {}".format(e))

    # Get the vectors array of triples: file_chunk_id, embedding, metadata for each embedding
    # Metadata is a dict with keys: filename, file_chunk_index
    vectors = []
    for i, (text_chunk, embedding) in enumerate(text_embeddings):
        id = get_unique_id_for_file_chunk(filename, i)
        vectors.append(({'id': id,
                         "vector": embedding,
                         'metadata': {"filename": filename,
                                      "text_chunk": text_chunk,
                                      "file_chunk_index": i}}))

    try:
        load_vectors(redis_conn, vectors, text_embedding_field)
    except Exception as e:
        print(f'Ran into a problem uploading to Redis: {e}')

# Make a class to generate batches for insertion
class BatchGenerator:

    def __init__(self, batch_size: int = 10) -> None:
        self.batch_size = batch_size

    # Makes chunks out of an input DataFrame
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        splits = self.splits_num(df.shape[0])
        if splits <= 1:
            yield df
        else:
            for chunk in np.array_split(df, splits):
                yield chunk

    # Determines how many chunks DataFrame contains
    def splits_num(self, elements: int) -> int:
        return round(elements / self.batch_size)

    __call__ = to_batches
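
An illustrative end-to-end ingestion sketch (not part of `transformers.py`): `handle_file_string` expects a `(filename, text)` pair plus a tokenizer with `encode`/`decode` methods. This example assumes `tiktoken` (which is not pinned in `requirements.txt`), that an OpenAI API key is set, that the Redis index already exists, and that it is run from the app directory so the local `transformers.py` module is the one imported.

```python
import tiktoken

from database import get_redis_connection
from transformers import handle_file_string
from config import VECTOR_FIELD_NAME, INDEX_NAME

tokenizer = tiktoken.get_encoding("cl100k_base")  # assumed encoding for ada-002 embeddings
redis_conn = get_redis_connection()

# Chunk, embed and load a single (hypothetical) document into Redis
handle_file_string(
    ("f1_regulations_2023.txt", "Article 1: The cost cap for a power unit is ..."),
    tokenizer,
    redis_conn,
    VECTOR_FIELD_NAME,
    INDEX_NAME,
)
```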