const { Client } = require('@microsoft/microsoft-graph-client');
const pdfParse = require('pdf-parse');
const { Buffer } = require('buffer');
const path = require('path');
const axios = require('axios');
const qs = require('querystring');
const { OpenAI } = require('openai');
//// --------- ENVIRONMENT CONFIGURATION AND INITIALIZATION ---------
// Function to initialize the Microsoft Graph client
const initGraphClient = (accessToken) => {
    return Client.init({
        authProvider: (done) => {
            done(null, accessToken); // Pass the access token for Graph API calls
        }
    });
};
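// Illustrative usage (a sketch, not executed at module load): the client is created once per request
// with the OBO token obtained by getOboToken below, e.g.
//   const client = initGraphClient(accessToken);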
//// --------- AUTHENTICATION AND TOKEN MANAGEMENT ---------
// Function to obtain an on-behalf-of (OBO) token. It takes the access token from the request header
// (scoped to this Function App) and exchanges it for a new token to use for Graph API calls.
const getOboToken = async (userAccessToken) => {
    const { TENANT_ID, CLIENT_ID, MICROSOFT_PROVIDER_AUTHENTICATION_SECRET } = process.env;
    const scope = 'https://graph.microsoft.com/.default';
    const oboTokenUrl = `https://login.microsoftonline.com/${TENANT_ID}/oauth2/v2.0/token`;
    const params = {
        client_id: CLIENT_ID,
        client_secret: MICROSOFT_PROVIDER_AUTHENTICATION_SECRET,
        grant_type: 'urn:ietf:params:oauth:grant-type:jwt-bearer',
        assertion: userAccessToken,
        requested_token_use: 'on_behalf_of',
        scope: scope
    };
    try {
        const response = await axios.post(oboTokenUrl, qs.stringify(params), {
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded'
            }
        });
        return response.data.access_token; // OBO token
    } catch (error) {
        console.error('Error obtaining OBO token:', error.response?.data || error.message);
        throw error;
    }
};
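// Illustrative usage (a sketch, mirroring the call made in the Function handler below):
//   const bearerToken = req.headers.authorization.split(' ')[1];
//   const graphToken = await getOboToken(bearerToken);
// This assumes TENANT_ID, CLIENT_ID, and MICROSOFT_PROVIDER_AUTHENTICATION_SECRET are configured
// as application settings on the Function App.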
//// --------- DOCUMENT PROCESSING ---------
// Function to fetch drive item content and convert it to text
const getDriveItemContent = async (client, driveId, itemId, name) => {
    try {
        const fileType = path.extname(name).toLowerCase();
        // The file types below can be converted to PDF so their text can be extracted.
        // See https://learn.microsoft.com/en-us/graph/api/driveitem-get-content-format?view=graph-rest-1.0&tabs=http
        const allowedFileTypes = ['.pdf', '.doc', '.docx', '.odp', '.ods', '.odt', '.pot', '.potm', '.potx', '.pps', '.ppsx', '.ppsxm', '.ppt', '.pptm', '.pptx', '.rtf'];
        // filePath changes based on file type: ?format=pdf is appended to convert non-PDF types to PDF
        // for text extraction, so every file in allowedFileTypes above is downloaded as a PDF.
        const filePath = `/drives/${driveId}/items/${itemId}/content` + ((fileType === '.pdf' || fileType === '.txt' || fileType === '.csv') ? '' : '?format=pdf');
        if (allowedFileTypes.includes(fileType)) {
            const response = await client.api(filePath).getStream();
            // Combine the chunks from the response stream into a single buffer
            let chunks = [];
            for await (let chunk of response) {
                chunks.push(chunk);
            }
            let buffer = Buffer.concat(chunks);
            // Extract the text from the PDF
            const pdfContents = await pdfParse(buffer);
            return pdfContents.text;
        } else if (fileType === '.txt') {
            // If the type is txt, there is no need to create a stream; just grab the content directly
            const response = await client.api(filePath).get();
            return response;
        } else if (fileType === '.csv') {
            const response = await client.api(filePath).getStream();
            let chunks = [];
            for await (let chunk of response) {
                chunks.push(chunk);
            }
            let buffer = Buffer.concat(chunks);
            let dataString = buffer.toString('utf-8');
            return dataString;
        } else {
            return 'Unsupported File Type';
        }
    } catch (error) {
        console.error('Error fetching drive content:', error);
        throw new Error(`Failed to fetch content for ${name}: ${error.message}`);
    }
};
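// Illustrative usage (a sketch; driveId, itemId, and file name are placeholder values):
//   const text = await getDriveItemContent(client, 'b!placeholderDriveId', '01PLACEHOLDERITEMID', 'report.docx');
// Here a .docx file would be converted to PDF by Graph and its text extracted with pdf-parse.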
// Function to get relevant parts of text using gpt-4o-mini.
const getRelevantParts = async (text, query) => {
    try {
        // We use your OpenAI key to initialize the OpenAI client
        const openAIKey = process.env["OPENAI_API_KEY"];
        const openai = new OpenAI({
            apiKey: openAIKey,
        });
        const response = await openai.chat.completions.create({
            // Using gpt-4o-mini for its speed, to prevent timeouts. You can tweak this prompt as needed.
            model: "gpt-4o-mini",
            messages: [
                { "role": "system", "content": "You are a helpful assistant that finds relevant content in text based on a query. You only return the relevant sentences, and you return a maximum of 10 sentences" },
                { "role": "user", "content": `Based on this question: **"${query}"**, get the relevant parts from the following text: *****\n\n${text}*****. If you cannot answer the question based on the text, respond with 'No information provided'` }
            ],
            // Using a temperature of 0 since we only want to extract the relevant content
            temperature: 0,
            // Using max_tokens of 1000, but you can customize this based on the number of documents you are searching.
            max_tokens: 1000
        });
        return response.choices[0].message.content;
    } catch (error) {
        console.error('Error with OpenAI:', error);
        return 'Error processing text with OpenAI: ' + error;
    }
};
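// Illustrative usage (a sketch; the text and query are placeholder values):
//   const relevant = await getRelevantParts('...full document text...', 'What were Q3 revenues?');
// Returns up to 10 sentences from the text relevant to the query, or 'No information provided'.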
//// --------- AZURE FUNCTION LOGIC ---------
// Below is what the Azure Function executes
module.exports = async function (context, req) {
    const query = req.query.query || (req.body && req.body.query);
    const searchTerm = req.query.searchTerm || (req.body && req.body.searchTerm);
    if (!req.headers.authorization) {
        context.res = {
            status: 400,
            body: 'Authorization header is missing'
        };
        return;
    }
    /// The below takes the token passed to the Function and uses it to get an OBO token.
    const bearerToken = req.headers.authorization.split(' ')[1];
    let accessToken;
    try {
        accessToken = await getOboToken(bearerToken);
    } catch (error) {
        context.res = {
            status: 500,
            body: `Failed to obtain OBO token: ${error.message}`
        };
        return;
    }
    // Initialize the Graph client using the initGraphClient function defined above
    let client = initGraphClient(accessToken);
    // This is the search body to be used in the Microsoft Graph Search API: https://learn.microsoft.com/en-us/graph/search-concept-files
    const requestBody = {
        requests: [
            {
                entityTypes: ['driveItem'],
                query: {
                    queryString: searchTerm
                },
                from: 0,
                // The below processes the top 10 search results from the Graph API; you can configure this based on your documents.
                size: 10
            }
        ]
    };
    try {
        // Function to tokenize content (here, simply splitting into words on whitespace).
        const tokenizeContent = (content) => {
            return content.split(/\s+/);
        };
        // Function to break the tokens into 10k-token windows for gpt-4o-mini
        const breakIntoTokenWindows = (tokens) => {
            const tokenWindows = [];
            const maxWindowTokens = 10000; // 10k tokens
            let startIndex = 0;
            while (startIndex < tokens.length) {
                const window = tokens.slice(startIndex, startIndex + maxWindowTokens);
                tokenWindows.push(window);
                startIndex += maxWindowTokens;
            }
            return tokenWindows;
        };
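        // Note: the 'tokens' here are whitespace-delimited words, so a 10k-word window is only a rough
        // proxy for model tokens. Illustrative example (not executed): a 25,000-word document would
        // yield three windows of 10,000, 10,000, and 5,000 words, each processed separately below.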
        // This is where we perform the search
        const list = await client.api('/search/query').post(requestBody);
        const processList = async () => {
            // For each search hit, grab the contents of the file and summarize it with gpt-4o-mini
            const results = [];
            await Promise.all(list.value[0].hitsContainers.map(async (container) => {
                for (const hit of container.hits) {
                    if (hit.resource["@odata.type"] === "#microsoft.graph.driveItem") {
                        const { name, id } = hit.resource;
                        // We use the below to grab the URL of the file to include in the response
                        const webUrl = hit.resource.webUrl.replace(/\s/g, "%20");
                        // The Microsoft Graph API ranks the responses, so we use this to order them
                        const rank = hit.rank;
                        // The below is where the file lives
                        const driveId = hit.resource.parentReference.driveId;
                        const contents = await getDriveItemContent(client, driveId, id, name);
                        if (contents !== 'Unsupported File Type') {
                            // Tokenize content using the function defined previously
                            const tokens = tokenizeContent(contents);
                            // Break tokens into 10k-token windows
                            const tokenWindows = breakIntoTokenWindows(tokens);
                            // Process each token window and combine the results
                            const relevantPartsPromises = tokenWindows.map(window => getRelevantParts(window.join(' '), query));
                            const relevantParts = await Promise.all(relevantPartsPromises);
                            const combinedResults = relevantParts.join('\n'); // Combine results
                            results.push({ name, webUrl, rank, contents: combinedResults });
                        } else {
                            results.push({ name, webUrl, rank, contents: 'Unsupported File Type' });
                        }
                    }
                }
            }));
            return results;
        };
        let results;
        if (list.value[0].hitsContainers[0].total == 0) {
            // Return "no results found" to the API if the Microsoft Graph API returns no results
            results = 'No results found';
        } else {
            // If the Microsoft Graph API does return results, run processList to iterate through them.
            results = await processList();
            results.sort((a, b) => a.rank - b.rank);
        }
        context.res = {
            status: 200,
            body: results
        };
    } catch (error) {
        context.res = {
            status: 500,
            body: `Error performing search or processing results: ${error.message}`,
        };
    }
};
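// Illustrative request shape (a sketch; host name, route, and values are placeholders):
//   POST https://<your-function-app>.azurewebsites.net/api/<function-name>
//   Headers: Authorization: Bearer <user access token scoped to this Function App>
//   Body: { "query": "What were Q3 revenues?", "searchTerm": "quarterly report" }
// On success, the Function responds with an array of { name, webUrl, rank, contents } objects
// sorted by Graph rank, or the string 'No results found'.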