Pushing enterprise knowledge retrieval to cookbook

This commit is contained in:
commit a3918a9d60 (parent 7d418b9bf2)
apps/enterprise-knowledge-retrieval/README.md (new file, 36 lines)
@@ -0,0 +1,36 @@
# Enterprise Knowledge Retrieval

This repo is a deep dive on Enterprise Knowledge Retrieval, which aims to take some unstructured text documents and create a usable knowledge base application from them.

This repo contains a notebook and a basic Streamlit app:

- `enterprise_knowledge_retrieval.ipynb`: A notebook containing a step-by-step process of tokenising, chunking and embedding your data in a vector database, building a chat agent on top of it, and running a basic evaluation of its performance.
- `chatbot.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.

To run the app, please follow the instructions below in the `App` section.

## Notebook

The notebook is the best place to start, and takes you through an end-to-end workflow for setting up and evaluating a simple back-end knowledge retrieval service:

- **Setup:** Initiate variables and connect to a vector database.
- **Storage:** Configure the database, prepare our data, and store embeddings and metadata for retrieval (a minimal sketch follows at the end of this section).
- **Search:** Retrieve relevant documents with a basic search function and use an LLM to summarise the results into a concise reply.
- **Answer:** Add a more sophisticated agent which processes the user's query and maintains a memory for follow-up questions.
- **Evaluate:** Take question/answer pairs produced by the service, evaluate them, and plot the results to scope out remedial action.

Once you've run the notebook through to the Search stage, you should have what you need to set up and run the app.
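
For orientation, here is a minimal sketch of what the Storage step boils down to. It assumes the RediSearch index from `config.py` already exists with an HNSW vector field; the `store_chunk` helper and the `wiki:` key prefix are illustrative, not names from the notebook.

```python
# Minimal sketch of the Storage step (illustrative; the notebook is the
# canonical version). Assumes a RediSearch index with an HNSW vector field
# named VECTOR_FIELD_NAME has already been created.
import numpy as np
import openai

from config import EMBEDDINGS_MODEL, VECTOR_FIELD_NAME
from database import get_redis_connection

redis_client = get_redis_connection()


def store_chunk(doc_id: str, title: str, url: str, chunk: str) -> None:
    # Embed the chunk, then store the vector and its metadata in a Redis hash
    embedding = openai.Embedding.create(input=chunk, model=EMBEDDINGS_MODEL)[
        "data"
    ][0]["embedding"]
    redis_client.hset(
        f"wiki:{doc_id}",  # hypothetical key prefix
        mapping={
            "title": title,
            "url": url,
            "content": chunk,
            VECTOR_FIELD_NAME: np.array(embedding, dtype=np.float32).tobytes(),
        },
    )
```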
## App

We've rolled in a basic Streamlit app that you can interact with to test your retrieval service using either standard semantic search or HyDE (Hypothetical Document Embeddings) retrieval, which embeds a model-generated hypothetical answer rather than the raw query.

You can use it by:

- Ensuring you have followed the Setup and Storage steps from the notebook to populate a vector database with searchable content.
- Setting up a virtual environment by running `virtualenv venv` (ensure `virtualenv` is installed).
- Activating the environment by running `source venv/bin/activate`.
- Installing requirements by running `pip install -r requirements.txt`.
- Running `streamlit run chatbot.py` to fire up the Streamlit app in your browser.

## Limitations

- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases`, depending on your needs.
- We introduce many areas you may optimise in the notebook, but we'll deep-dive on these in separate offerings in the coming weeks.
apps/enterprise-knowledge-retrieval/assistant.py (new file, 169 lines)
@@ -0,0 +1,169 @@
from langchain.agents import (
    Tool,
    AgentExecutor,
    LLMSingleActionAgent,
    AgentOutputParser,
)
from langchain.prompts import BaseChatPromptTemplate
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish, HumanMessage
from langchain.memory import ConversationBufferWindowMemory
import openai
import re
import streamlit as st

from database import get_redis_results, get_redis_connection
from config import RETRIEVAL_PROMPT, CHAT_MODEL, INDEX_NAME, SYSTEM_PROMPT


redis_client = get_redis_connection()


def answer_user_question(query):
    # Retrieve the most relevant documents for the query
    results = get_redis_results(redis_client, query, INDEX_NAME)

    # Persist the results so the app can display them in an expander
    results.to_csv("results.csv")

    # Concatenate the top three results into a single context block
    search_content = ""
    for _, row in results.head(3).iterrows():
        search_content += row["title"] + "\n" + row["result"] + "\n\n"

    retrieval_prepped = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
    )

    retrieval = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": retrieval_prepped}],
        max_tokens=500,
    )

    # Response provided by GPT-3.5
    return retrieval["choices"][0]["message"]["content"]


def answer_question_hyde(query):
    # HyDE: generate a hypothetical answer first, then search with its embedding
    hyde_prompt = """You are OracleGPT, a helpful expert who answers user questions to the best of your ability.
Provide a confident answer to their question. If you don't know the answer, make the best guess you can based on the context of the question.

User question: {USER_QUESTION_HERE}

Answer:"""

    hypothetical_answer = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[
            {
                "role": "user",
                "content": hyde_prompt.format(USER_QUESTION_HERE=query),
            }
        ],
    )["choices"][0]["message"]["content"]
    # st.write(hypothetical_answer)
    results = get_redis_results(redis_client, hypothetical_answer, INDEX_NAME)

    results.to_csv("results.csv")

    search_content = ""
    for _, row in results.head(3).iterrows():
        search_content += row["title"] + "\n" + row["result"] + "\n\n"

    retrieval_prepped = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
    )
    retrieval = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": retrieval_prepped}],
        max_tokens=500,
    )

    return retrieval["choices"][0]["message"]["content"]


# Set up a prompt template
class CustomPromptTemplate(BaseChatPromptTemplate):
    # The template to use
    template: str
    # The list of tools available
    tools: List[Tool]

    def format_messages(self, **kwargs) -> List[HumanMessage]:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # and format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join(
            [f"{tool.name}: {tool.description}" for tool in self.tools]
        )
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
        formatted = self.template.format(**kwargs)
        return [HumanMessage(content=formatted)]


class CustomOutputParser(AgentOutputParser):
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        # Check if the agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values are generally a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(
            tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output
        )


def initiate_agent(tools):
    prompt = CustomPromptTemplate(
        template=SYSTEM_PROMPT,
        tools=tools,
        # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
        # The history template includes "history" as an input variable so we can interpolate it into the prompt
        input_variables=["input", "intermediate_steps", "history"],
    )

    # Initiate the memory with k=2 to keep the last two turns
    # and provide the memory to the agent
    memory = ConversationBufferWindowMemory(k=2)

    output_parser = CustomOutputParser()

    llm = ChatOpenAI(temperature=0)

    # LLM chain consisting of the LLM and a prompt
    llm_chain = LLMChain(llm=llm, prompt=prompt)

    tool_names = [tool.name for tool in tools]
    agent = LLMSingleActionAgent(
        llm_chain=llm_chain,
        output_parser=output_parser,
        stop=["\nObservation:"],
        allowed_tools=tool_names,
    )

    agent_executor = AgentExecutor.from_agent_and_tools(
        agent=agent, tools=tools, verbose=True, memory=memory
    )

    return agent_executor
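As a usage sketch (not part of the commit), the agent can also be exercised outside Streamlit once the index is populated; the question below is made up:

```python
# Hypothetical standalone usage of initiate_agent, outside the Streamlit app.
from langchain.agents import Tool

from assistant import answer_user_question, initiate_agent

tools = [
    Tool(
        name="Search",
        func=answer_user_question,
        description="Useful for when you need to answer general knowledge questions. Input should be a fully formed question.",
    )
]

agent_executor = initiate_agent(tools)
# Each run appends to the agent's two-turn conversation memory
print(agent_executor.run("Who was the first emperor of Rome?"))
```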
apps/enterprise-knowledge-retrieval/chatbot.py (new file, 76 lines)
@@ -0,0 +1,76 @@
from langchain.agents import Tool
import pandas as pd
import streamlit as st
from streamlit_chat import message

from database import get_redis_connection
from assistant import answer_user_question, initiate_agent, answer_question_hyde

# Initialise database

## Initialise Redis connection
redis_client = get_redis_connection()


### CHATBOT APP

# --- GENERAL SETTINGS ---
PAGE_TITLE: str = "Knowledge Retrieval Bot"
PAGE_ICON: str = "🤖"

st.set_page_config(page_title=PAGE_TITLE, page_icon=PAGE_ICON)

st.title("Wiki Chatbot")
st.subheader("Learn things - random things!")

# Sidebar toggle between standard semantic search and HyDE
add_selectbox = st.sidebar.selectbox(
    "What kind of search?", ("Standard vector search", "HyDE")
)

# Define which tools the agent can use to answer user queries
tools = [
    Tool(
        name="Search",
        func=answer_user_question
        if add_selectbox == "Standard vector search"
        else answer_question_hyde,
        description="Useful for when you need to answer general knowledge questions. Input should be a fully formed question.",
    )
]

# Session state holds the conversation history across Streamlit reruns
if "generated" not in st.session_state:
    st.session_state["generated"] = []

if "past" not in st.session_state:
    st.session_state["past"] = []


prompt = st.text_input("What do you want to know: ", "", key="input")

if st.button("Submit", key="generationSubmit"):
    with st.spinner("Thinking..."):
        # Initialise the agent once and reuse it across reruns
        if "agent" not in st.session_state:
            st.session_state["agent"] = initiate_agent(tools)

        response = st.session_state["agent"].run(prompt)

        st.session_state.past.append(prompt)
        st.session_state.generated.append(response)

if len(st.session_state["generated"]) > 0:
    for i in range(len(st.session_state["generated"]) - 1, -1, -1):
        message(st.session_state["generated"][i], key=str(i))
        message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")

    with st.expander("See search results"):
        # The assistant functions write the latest retrieval results to results.csv
        results = list(pd.read_csv("results.csv")["result"])

        st.write(results)
apps/enterprise-knowledge-retrieval/config.py (new file, 46 lines)
@@ -0,0 +1,46 @@
REDIS_HOST = "localhost"
REDIS_PORT = "6380"
REDIS_DB = "0"
INDEX_NAME = "wiki-index"
VECTOR_FIELD_NAME = "content_vector"
CHAT_MODEL = "gpt-3.5-turbo"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# Set up the base template
SYSTEM_PROMPT = """You are WikiGPT, a helpful bot who has access to a database of Wikipedia data to answer questions.
Accept the first answer that you are provided for the user.
You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin! Remember to give detailed, informative answers.

Previous conversation history:
{history}

New question: {input}
{agent_scratchpad}"""
# Build a prompt that provides the original query and the retrieved content, and asks the model to summarise for the user
RETRIEVAL_PROMPT = """Use the content to answer the search query the customer has sent. Provide the source for your answer.
If you can't answer the user's question, say "Sorry, I am unable to answer the question with the content". Do not guess.

Search query:

{SEARCH_QUERY_HERE}

Content:

{SEARCH_CONTENT_HERE}

Answer:
"""
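A quick sanity check, with made-up query and content values, showing how `assistant.py` renders this template with `str.format`:

```python
# Hypothetical sanity check: render RETRIEVAL_PROMPT the same way
# assistant.py does, with made-up query and content values.
from config import RETRIEVAL_PROMPT

rendered = RETRIEVAL_PROMPT.format(
    SEARCH_QUERY_HERE="Who invented the telephone?",
    SEARCH_CONTENT_HERE="Telephone\nThe telephone was patented by Alexander Graham Bell in 1876.",
)
print(rendered)
```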
apps/enterprise-knowledge-retrieval/data/wikipedia_articles_2000.csv (new file, 60403 lines)
File diff suppressed because it is too large.
apps/enterprise-knowledge-retrieval/database.py (new file, 72 lines)
@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
import openai
from redis import Redis
from redis.commands.search.query import Query

from config import (
    REDIS_DB,
    REDIS_HOST,
    REDIS_PORT,
    VECTOR_FIELD_NAME,
    EMBEDDINGS_MODEL,
)


def get_redis_connection():
    redis_client = Redis(
        host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=False
    )
    return redis_client


# Make a vector similarity query to Redis
def query_redis(redis_conn, query, index_name, top_k=5):
    # Create an embedding vector from the user query
    embedded_query = np.array(
        openai.Embedding.create(input=query, model=EMBEDDINGS_MODEL)["data"][0][
            "embedding"
        ],
        dtype=np.float32,
    ).tobytes()

    # Prepare the KNN query
    q = (
        Query(f"*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]")
        .sort_by("vector_score")
        .paging(0, top_k)
        .return_fields("vector_score", "url", "title", "content", "text_chunk_index")
        .dialect(2)
    )
    params_dict = {"vec_param": embedded_query}

    # Execute the query
    results = redis_conn.ft(index_name).search(q, query_params=params_dict)

    return results


# Get mapped documents from Redis results
def get_redis_results(redis_conn, query, index_name):
    # Get the most relevant documents from Redis
    query_result = query_redis(redis_conn, query, index_name)

    # Extract the fields we care about into a list of tuples
    query_result_list = []
    for i, result in enumerate(query_result.docs):
        result_order = i
        url = result.url
        title = result.title
        text = result.content
        score = result.vector_score
        query_result_list.append((result_order, url, title, text, score))

    # Return the results as a DataFrame for ease of use
    result_df = pd.DataFrame(query_result_list)
    result_df.columns = ["id", "url", "title", "result", "certainty"]
    return result_df
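For reference, a minimal sketch of querying a populated index with these helpers; the query string is made up:

```python
# Hypothetical usage of the database helpers once the index is populated.
from config import INDEX_NAME
from database import get_redis_connection, get_redis_results

redis_client = get_redis_connection()
results = get_redis_results(redis_client, "history of the telephone", INDEX_NAME)
# DataFrame with columns: id, url, title, result, certainty
print(results.head())
```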
(File diff suppressed because it is too large.)
(Binary file not shown: new image, 72 KiB.)
apps/enterprise-knowledge-retrieval/requirements.txt (new file, 7 lines)
@@ -0,0 +1,7 @@
langchain==0.0.158
numpy==1.24.2
openai==0.27.4
pandas==2.0.0
redis==4.5.4
streamlit==1.22.0
streamlit_chat==0.0.2.2