const { Client } = require('@microsoft/microsoft-graph-client');
const pdfParse = require('pdf-parse');
const { Buffer } = require('buffer');
const path = require('path');
const axios = require('axios');
const qs = require('querystring');
const { OpenAI } = require('openai');
//// --------- ENVIRONMENT CONFIGURATION AND INITIALIZATION ---------
// Function to initialize the Microsoft Graph client
const initGraphClient = (accessToken) => {
    return Client.init({
        authProvider: (done) => {
            done(null, accessToken); // Pass the access token for Graph API calls
        }
    });
};
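// Illustrative usage (a sketch, not executed at module load): the client is created once per request
// with the OBO token obtained by getOboToken below, e.g.
//   const client = initGraphClient(accessToken);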
//// --------- AUTHENTICATION AND TOKEN MANAGEMENT ---------
// Function to obtain an on-behalf-of (OBO) token. It takes the access token from the request header
// (scoped to this Function App) and exchanges it for a new token to use for Graph API calls.
const getOboToken = async (userAccessToken) => {
    const { TENANT_ID, CLIENT_ID, MICROSOFT_PROVIDER_AUTHENTICATION_SECRET } = process.env;
    const scope = 'https://graph.microsoft.com/.default';
    const oboTokenUrl = `https://login.microsoftonline.com/${TENANT_ID}/oauth2/v2.0/token`;
    const params = {
        client_id: CLIENT_ID,
        client_secret: MICROSOFT_PROVIDER_AUTHENTICATION_SECRET,
        grant_type: 'urn:ietf:params:oauth:grant-type:jwt-bearer',
        assertion: userAccessToken,
        requested_token_use: 'on_behalf_of',
        scope: scope
    };
    try {
        const response = await axios.post(oboTokenUrl, qs.stringify(params), {
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded'
            }
        });
        return response.data.access_token; // OBO token
    } catch (error) {
        console.error('Error obtaining OBO token:', error.response?.data || error.message);
        throw error;
    }
};
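// Illustrative usage (a sketch, mirroring the call made in the Function handler below):
//   const bearerToken = req.headers.authorization.split(' ')[1];
//   const graphToken = await getOboToken(bearerToken);
// This assumes TENANT_ID, CLIENT_ID, and MICROSOFT_PROVIDER_AUTHENTICATION_SECRET are configured
// as application settings on the Function App.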
//// --------- DOCUMENT PROCESSING ---------
// Function to fetch drive item content and convert it to text
const getDriveItemContent = async (client, driveId, itemId, name) => {
    try {
        const fileType = path.extname(name).toLowerCase();
        // The file types below can be converted to PDF so their text can be extracted.
        // See https://learn.microsoft.com/en-us/graph/api/driveitem-get-content-format?view=graph-rest-1.0&tabs=http
        const allowedFileTypes = ['.pdf', '.doc', '.docx', '.odp', '.ods', '.odt', '.pot', '.potm', '.potx', '.pps', '.ppsx', '.ppsxm', '.ppt', '.pptm', '.pptx', '.rtf'];
        // filePath changes based on file type: ?format=pdf is appended to convert non-PDF types to PDF
        // for text extraction, so every file in allowedFileTypes above is downloaded as a PDF.
        const filePath = `/drives/${driveId}/items/${itemId}/content` + ((fileType === '.pdf' || fileType === '.txt' || fileType === '.csv') ? '' : '?format=pdf');
        if (allowedFileTypes.includes(fileType)) {
            const response = await client.api(filePath).getStream();
            // Combine the chunks from the response stream into a single buffer
            let chunks = [];
            for await (let chunk of response) {
                chunks.push(chunk);
            }
            let buffer = Buffer.concat(chunks);
            // Extract the text from the PDF
            const pdfContents = await pdfParse(buffer);
            return pdfContents.text;
        } else if (fileType === '.txt') {
            // If the type is txt, there is no need to create a stream; just grab the content directly
            const response = await client.api(filePath).get();
            return response;
        } else if (fileType === '.csv') {
            const response = await client.api(filePath).getStream();
            let chunks = [];
            for await (let chunk of response) {
                chunks.push(chunk);
            }
            let buffer = Buffer.concat(chunks);
            let dataString = buffer.toString('utf-8');
            return dataString;
        } else {
            return 'Unsupported File Type';
        }
    } catch (error) {
        console.error('Error fetching drive content:', error);
        throw new Error(`Failed to fetch content for ${name}: ${error.message}`);
    }
};
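// Illustrative usage (a sketch; driveId, itemId, and file name are placeholder values):
//   const text = await getDriveItemContent(client, 'b!placeholderDriveId', '01PLACEHOLDERITEMID', 'report.docx');
// Here a .docx file would be converted to PDF by Graph and its text extracted with pdf-parse.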
// Function to get relevant parts of text using gpt-4o-mini.
const getRelevantParts = async (text, query) => {
    try {
        // We use your OpenAI key to initialize the OpenAI client
        const openAIKey = process.env["OPENAI_API_KEY"];
        const openai = new OpenAI({
            apiKey: openAIKey,
        });
        const response = await openai.chat.completions.create({
            // Using gpt-4o-mini for its speed, to prevent timeouts. You can tweak this prompt as needed.
            model: "gpt-4o-mini",
            messages: [
                { "role": "system", "content": "You are a helpful assistant that finds relevant content in text based on a query. You only return the relevant sentences, and you return a maximum of 10 sentences" },
                { "role": "user", "content": `Based on this question: **"${query}"**, get the relevant parts from the following text: *****\n\n${text}*****. If you cannot answer the question based on the text, respond with 'No information provided'` }
            ],
            // Using a temperature of 0 since we only want to extract the relevant content
            temperature: 0,
            // Using max_tokens of 1000, but you can customize this based on the number of documents you are searching.
            max_tokens: 1000
        });
        return response.choices[0].message.content;
    } catch (error) {
        console.error('Error with OpenAI:', error);
        return 'Error processing text with OpenAI: ' + error;
    }
};
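// Illustrative usage (a sketch; the text and query are placeholder values):
//   const relevant = await getRelevantParts('...full document text...', 'What were Q3 revenues?');
// Returns up to 10 sentences from the text relevant to the query, or 'No information provided'.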
//// --------- AZURE FUNCTION LOGIC ---------
// Below is what the Azure Function executes
module.exports = async function (context, req) {
    const query = req.query.query || (req.body && req.body.query);
    const searchTerm = req.query.searchTerm || (req.body && req.body.searchTerm);
    if (!req.headers.authorization) {
        context.res = {
            status: 400,
            body: 'Authorization header is missing'
        };
        return;
    }
    /// The below takes the token passed to the Function and uses it to get an OBO token.
    const bearerToken = req.headers.authorization.split(' ')[1];
    let accessToken;
    try {
        accessToken = await getOboToken(bearerToken);
    } catch (error) {
        context.res = {
            status: 500,
            body: `Failed to obtain OBO token: ${error.message}`
        };
        return;
    }
    // Initialize the Graph client using the initGraphClient function defined above
    let client = initGraphClient(accessToken);
    // This is the search body to be used in the Microsoft Graph Search API: https://learn.microsoft.com/en-us/graph/search-concept-files
    const requestBody = {
        requests: [
            {
                entityTypes: ['driveItem'],
                query: {
                    queryString: searchTerm
                },
                from: 0,
                // The below processes the top 10 search results from the Graph API; you can configure this based on your documents.
                size: 10
            }
        ]
    };
    try {
        // Function to tokenize content (here, simply splitting into words on whitespace).
        const tokenizeContent = (content) => {
            return content.split(/\s+/);
        };
        // Function to break the tokens into 10k-token windows for gpt-4o-mini
        const breakIntoTokenWindows = (tokens) => {
            const tokenWindows = [];
            const maxWindowTokens = 10000; // 10k tokens
            let startIndex = 0;
            while (startIndex < tokens.length) {
                const window = tokens.slice(startIndex, startIndex + maxWindowTokens);
                tokenWindows.push(window);
                startIndex += maxWindowTokens;
            }
            return tokenWindows;
        };
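        // Note: the 'tokens' here are whitespace-delimited words, so a 10k-word window is only a rough
        // proxy for model tokens. Illustrative example (not executed): a 25,000-word document would
        // yield three windows of 10,000, 10,000, and 5,000 words, each processed separately below.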
        // This is where we perform the search
        const list = await client.api('/search/query').post(requestBody);
        const processList = async () => {
            // For each search hit, grab the contents of the file and summarize it with gpt-4o-mini
            const results = [];
            await Promise.all(list.value[0].hitsContainers.map(async (container) => {
                for (const hit of container.hits) {
                    if (hit.resource["@odata.type"] === "#microsoft.graph.driveItem") {
                        const { name, id } = hit.resource;
                        // We use the below to grab the URL of the file to include in the response
                        const webUrl = hit.resource.webUrl.replace(/\s/g, "%20");
                        // The Microsoft Graph API ranks the responses, so we use this to order them
                        const rank = hit.rank;
                        // The below is where the file lives
                        const driveId = hit.resource.parentReference.driveId;
                        const contents = await getDriveItemContent(client, driveId, id, name);
                        if (contents !== 'Unsupported File Type') {
                            // Tokenize content using the function defined previously
                            const tokens = tokenizeContent(contents);
                            // Break tokens into 10k-token windows
                            const tokenWindows = breakIntoTokenWindows(tokens);
                            // Process each token window and combine the results
                            const relevantPartsPromises = tokenWindows.map(window => getRelevantParts(window.join(' '), query));
                            const relevantParts = await Promise.all(relevantPartsPromises);
                            const combinedResults = relevantParts.join('\n'); // Combine results
                            results.push({ name, webUrl, rank, contents: combinedResults });
                        } else {
                            results.push({ name, webUrl, rank, contents: 'Unsupported File Type' });
                        }
                    }
                }
            }));
            return results;
        };
        let results;
        if (list.value[0].hitsContainers[0].total == 0) {
            // Return "no results found" to the API if the Microsoft Graph API returns no results
            results = 'No results found';
        } else {
            // If the Microsoft Graph API does return results, run processList to iterate through them.
            results = await processList();
            results.sort((a, b) => a.rank - b.rank);
        }
        context.res = {
            status: 200,
            body: results
        };
    } catch (error) {
        context.res = {
            status: 500,
            body: `Error performing search or processing results: ${error.message}`,
        };
    }
};
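// Illustrative request shape (a sketch; host name, route, and values are placeholders):
//   POST https://<your-function-app>.azurewebsites.net/api/<function-name>
//   Headers: Authorization: Bearer <user access token scoped to this Function App>
//   Body: { "query": "What were Q3 revenues?", "searchTerm": "quarterly report" }
// On success, the Function responds with an array of { name, webUrl, rank, contents } objects
// sorted by Graph rank, or the string 'No results found'.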