fixed code repository for solution one in Sharepoint solution (#1264)

This commit is contained in:
Max Reid 2024-06-28 15:38:54 -04:00 committed by GitHub
parent 774c524bd8
commit fb202f0369
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 19 additions and 102 deletions

View File

@ -145,7 +145,7 @@ See the documentation [here](https://learn.microsoft.com/en-us/azure/azure-funct
5. Leave all the other settings on this page as the default, but feel free to change based on your internal guidelines. 5. Leave all the other settings on this page as the default, but feel free to change based on your internal guidelines.
6. On the **permissions** tab, click **Add Permission** and add **Files.Read.All**, then **Add.** This allows this application to read files which is important in order to use the Microsoft Graph Search API. 6. On the **permissions** tab, click **Add Permission** and add **Files.Read.All** and **Sites.Read.All**, then **Add.** This allows this application to read files which is important in order to use the Microsoft Graph Search API.
4. Once it is created, **click on the enterprise application you just created** (so, leave the Function App page and land on the Enterprise Application that you just spun up)**.** We are now going to give it one more permission, to execute the Azure Function by impersonating the user logging into the application. See [here](https://learn.microsoft.com/en-us/azure/app-service/configure-authentication-provider-aad?tabs=workforce-tenant) for more details. 4. Once it is created, **click on the enterprise application you just created** (so, leave the Function App page and land on the Enterprise Application that you just spun up)**.** We are now going to give it one more permission, to execute the Azure Function by impersonating the user logging into the application. See [here](https://learn.microsoft.com/en-us/azure/app-service/configure-authentication-provider-aad?tabs=workforce-tenant) for more details.

View File

@ -1,10 +1,9 @@
const { Client } = require('@microsoft/microsoft-graph-client'); const { Client } = require('@microsoft/microsoft-graph-client');
const pdfParse = require('pdf-parse');
const { Buffer } = require('buffer'); const { Buffer } = require('buffer');
const path = require('path'); const path = require('path');
const axios = require('axios'); const axios = require('axios');
const qs = require('querystring'); const qs = require('querystring');
const { OpenAI } = require("openai");
//// --------- ENVIRONMENT CONFIGURATION AND INITIALIZATION --------- //// --------- ENVIRONMENT CONFIGURATION AND INITIALIZATION ---------
// Function to initialize Microsoft Graph client // Function to initialize Microsoft Graph client
@ -46,79 +45,36 @@ const getOboToken = async (userAccessToken) => {
}; };
//// --------- DOCUMENT PROCESSING --------- //// --------- DOCUMENT PROCESSING ---------
// Function to fetch drive item content and convert to text // Function to fetch drive item content and convert to text
const getDriveItemContent = async (client, driveId, itemId, name) => { const getDriveItemContent = async (client, driveId, itemId, name) => {
try { try {
const fileType = path.extname(name).toLowerCase(); // const fileType = path.extname(name).toLowerCase();
// the file types below are the ones that can be converted to PDF to extract the text. See https://learn.microsoft.com/en-us/graph/api/driveitem-get-content-format?view=graph-rest-1.0&tabs=http // the file types below are the ones that can be converted to PDF to extract the text. See https://learn.microsoft.com/en-us/graph/api/driveitem-get-content-format?view=graph-rest-1.0&tabs=http
const allowedFileTypes = ['.pdf', '.doc', '.docx', '.odp', '.ods', '.odt', '.pot', '.potm', '.potx', '.pps', '.ppsx', '.ppsxm', '.ppt', '.pptm', '.pptx', '.rtf']; // const allowedFileTypes = ['.pdf', '.doc', '.docx', '.odp', '.ods', '.odt', '.pot', '.potm', '.potx', '.pps', '.ppsx', '.ppsxm', '.ppt', '.pptm', '.pptx', '.rtf'];
// filePath changes based on file type, adding ?format=pdf to convert non-pdf types to pdf for text extraction, so all files in allowedFileTypes above are converted to pdf // filePath changes based on file type, adding ?format=pdf to convert non-pdf types to pdf for text extraction, so all files in allowedFileTypes above are converted to pdf
const filePath = `/drives/${driveId}/items/${itemId}/content` + ((fileType === '.pdf' || fileType === '.txt' || fileType === '.csv') ? '' : '?format=pdf'); const filePath = `/drives/${driveId}/items/${itemId}`;
if (allowedFileTypes.includes(fileType)) { const downloadPath = filePath + `/content`
response = await client.api(filePath).getStream(); const fileStream = await client.api(downloadPath).getStream();
// The below takes the chunks in response and combines let chunks = [];
let chunks = []; for await (let chunk of fileStream) {
for await (let chunk of response) {
chunks.push(chunk); chunks.push(chunk);
} }
let buffer = Buffer.concat(chunks); const base64String = Buffer.concat(chunks).toString('base64');
// the below extracts the text from the PDF. const file = await client.api(filePath).get();
const pdfContents = await pdfParse(buffer); const mime_type = file.file.mimeType;
return pdfContents.text; const name = file.name;
} else if (fileType === '.txt') { return {"name":name, "mime_type":mime_type, "content":base64String}
// If the type is txt, it does not need to create a stream and instead just grabs the content
response = await client.api(filePath).get();
return response;
} else if (fileType === '.csv') {
response = await client.api(filePath).getStream();
let chunks = [];
for await (let chunk of response) {
chunks.push(chunk);
}
let buffer = Buffer.concat(chunks);
let dataString = buffer.toString('utf-8');
return dataString
} else {
return 'Unsupported File Type';
}
} catch (error) { } catch (error) {
console.error('Error fetching drive content:', error); console.error('Error fetching drive content:', error);
throw new Error(`Failed to fetch content for ${name}: ${error.message}`); throw new Error(`Failed to fetch content for ${name}: ${error.message}`);
} }
}; };
// Function to get relevant parts of text using gpt-3.5-turbo.
const getRelevantParts = async (text, query) => {
try {
// We use your OpenAI key to initialize the OpenAI client
const openAIKey = process.env["OPENAI_API_KEY"];
const openai = new OpenAI({
apiKey: openAIKey,
});
const response = await openai.chat.completions.create({
// Using gpt-3.5-turbo due to speed to prevent timeouts. You can tweak this prompt as needed
model: "gpt-3.5-turbo-0125",
messages: [
{"role": "system", "content": "You are a helpful assistant that finds relevant content in text based on a query. You only return the relevant sentences, and you return a maximum of 10 sentences"},
{"role": "user", "content": `Based on this question: **"${query}"**, get the relevant parts from the following text:*****\n\n${text}*****. If you cannot answer the question based on the text, respond with 'No information provided'`}
],
// using temperature of 0 since we want to just extract the relevant content
temperature: 0,
// using max_tokens of 1000, but you can customize this based on the number of documents you are searching.
max_tokens: 1000
});
return response.choices[0].message.content;
} catch (error) {
console.error('Error with OpenAI:', error);
return 'Error processing text with OpenAI' + error;
}
};
//// --------- AZURE FUNCTION LOGIC --------- //// --------- AZURE FUNCTION LOGIC ---------
// Below is what the Azure Function executes // Below is what the Azure Function executes
module.exports = async function (context, req) { module.exports = async function (context, req) {
const query = req.query.query || (req.body && req.body.query); // const query = req.query.query || (req.body && req.body.query);
const searchTerm = req.query.searchTerm || (req.body && req.body.searchTerm); const searchTerm = req.query.searchTerm || (req.body && req.body.searchTerm);
if (!req.headers.authorization) { if (!req.headers.authorization) {
context.res = { context.res = {
@ -157,25 +113,6 @@ module.exports = async function (context, req) {
}; };
try { try {
// Function to tokenize content (e.g., based on words).
const tokenizeContent = (content) => {
return content.split(/\s+/);
};
// Function to break tokens into 10k token windows for gpt-3.5-turbo
const breakIntoTokenWindows = (tokens) => {
const tokenWindows = []
const maxWindowTokens = 10000; // 10k tokens
let startIndex = 0;
while (startIndex < tokens.length) {
const window = tokens.slice(startIndex, startIndex + maxWindowTokens);
tokenWindows.push(window);
startIndex += maxWindowTokens;
}
return tokenWindows;
};
// This is where we are doing the search // This is where we are doing the search
const list = await client.api('/search/query').post(requestBody); const list = await client.api('/search/query').post(requestBody);
@ -187,30 +124,9 @@ module.exports = async function (context, req) {
for (const hit of container.hits) { for (const hit of container.hits) {
if (hit.resource["@odata.type"] === "#microsoft.graph.driveItem") { if (hit.resource["@odata.type"] === "#microsoft.graph.driveItem") {
const { name, id } = hit.resource; const { name, id } = hit.resource;
// We use the below to grab the URL of the file to include in the response
const webUrl = hit.resource.webUrl.replace(/\s/g, "%20");
// The Microsoft Graph API ranks the reponses, so we use this to order it
const rank = hit.rank;
// The below is where the file lives
const driveId = hit.resource.parentReference.driveId; const driveId = hit.resource.parentReference.driveId;
const contents = await getDriveItemContent(client, driveId, id, name); const contents = await getDriveItemContent(client, driveId, id, name);
if (contents !== 'Unsupported File Type') { results.push(contents)
// Tokenize content using function defined previously
const tokens = tokenizeContent(contents);
// Break tokens into 10k token windows
const tokenWindows = breakIntoTokenWindows(tokens);
// Process each token window and combine results
const relevantPartsPromises = tokenWindows.map(window => getRelevantParts(window.join(' '), query));
const relevantParts = await Promise.all(relevantPartsPromises);
const combinedResults = relevantParts.join('\n'); // Combine results
results.push({ name, webUrl, rank, contents: combinedResults });
}
else {
results.push({ name, webUrl, rank, contents: 'Unsupported File Type' });
}
} }
} }
})); }));
@ -224,7 +140,8 @@ module.exports = async function (context, req) {
} else { } else {
// If the Microsoft Graph API does return results, then run processList to iterate through. // If the Microsoft Graph API does return results, then run processList to iterate through.
results = await processList(); results = await processList();
results.sort((a, b) => a.rank - b.rank); results = {'openaiFileResponse': results}
// results.sort((a, b) => a.rank - b.rank);
} }
context.res = { context.res = {
status: 200, status: 200,