diff --git a/apps/web-crawl-q-and-a/web-qa.py b/apps/web-crawl-q-and-a/web-qa.py
index 7355a5d..68eceea 100644
--- a/apps/web-crawl-q-and-a/web-qa.py
+++ b/apps/web-crawl-q-and-a/web-qa.py
@@ -20,6 +20,9 @@ from ast import literal_eval
 # Regex pattern to match a URL
 HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'
 
+# Define OpenAI api_key
+# openai.api_key = ''
+
 # Define root domain to crawl
 domain = "openai.com"
 full_url = "https://openai.com/"
@@ -136,22 +139,26 @@ def crawl(url):
         # Get the next URL from the queue
         url = queue.pop()
         print(url) # for debugging and to see the progress
+
+        # Try extracting the text from the link, if failed proceed with the next item in the queue
+        try:
+            # Save text from the url to a .txt file
+            with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
 
-        # Save text from the url to a .txt file
-        with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
+                # Get the text from the URL using BeautifulSoup
+                soup = BeautifulSoup(requests.get(url).text, "html.parser")
 
-            # Get the text from the URL using BeautifulSoup
-            soup = BeautifulSoup(requests.get(url).text, "html.parser")
+                # Get the text but remove the tags
+                text = soup.get_text()
 
-            # Get the text but remove the tags
-            text = soup.get_text()
-
-            # If the crawler gets to a page that requires JavaScript, it will stop the crawl
-            if ("You need to enable JavaScript to run this app." in text):
-                print("Unable to parse page " + url + " due to JavaScript being required")
+                # If the crawler gets to a page that requires JavaScript, it will stop the crawl
+                if ("You need to enable JavaScript to run this app." in text):
+                    print("Unable to parse page " + url + " due to JavaScript being required")
 
-            # Otherwise, write the text to the file in the text directory
-            f.write(text)
+                # Otherwise, write the text to the file in the text directory
+                f.write(text)
+        except Exception as e:
+            print("Unable to parse page " + url)
 
         # Get the hyperlinks from the URL and add them to the queue
         for link in get_domain_hyperlinks(local_domain, url):
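
The substantive change in the second hunk is wrapping the per-URL text extraction in a `try`/`except`, so a single URL that fails (network error, bad encoding, an unwritable filename) no longer aborts the whole crawl. Below is a minimal standalone sketch of that same pattern, assuming `requests` and `beautifulsoup4` are installed; `fetch_page_text` is a hypothetical helper for illustration, not a function in web-qa.py:

```python
import requests
from bs4 import BeautifulSoup

def fetch_page_text(url):
    # Hypothetical helper illustrating the diff's error-handling pattern;
    # it is not part of web-qa.py. Any failure while fetching or parsing
    # is caught so a crawl loop can skip this URL and continue.
    try:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        text = soup.get_text()

        # Same JavaScript-wall check as in web-qa.py
        if "You need to enable JavaScript to run this app." in text:
            print("Unable to parse page " + url + " due to JavaScript being required")

        return text
    except Exception:
        print("Unable to parse page " + url)
        return None

if __name__ == "__main__":
    # Returning None signals "skip this URL", mirroring the
    # continue-on-failure behavior the diff adds to crawl().
    text = fetch_page_text("https://openai.com/")
    if text is not None:
        print(text[:200])
```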