Catch the exception thrown by with open() and continue with the queue (#155)
Co-authored-by: Ashok Manghat <amanghat@rmplc.net>
parent 18b7ee5f37
commit 39b62a6c09
@@ -20,6 +20,9 @@ from ast import literal_eval
 # Regex pattern to match a URL
 HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'
 
+# Define OpenAI api_key
+# openai.api_key = '<Your API Key>'
+
 # Define root domain to crawl
 domain = "openai.com"
 full_url = "https://openai.com/"
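For reference, HTTP_URL_PATTERN above is presumably what the crawler uses to decide whether a discovered link is an absolute HTTP(S) URL. A minimal, self-contained check of what the pattern accepts (the sample URLs are illustrative, not from the repo):

    import re

    HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'

    for candidate in ["https://openai.com/", "http://example.org/page",
                      "mailto:hi@example.org", "/relative/path"]:
        print(candidate, "->", re.match(HTTP_URL_PATTERN, candidate) is not None)

The first two print True, the last two False: the pattern requires an explicit http:// or https:// prefix, so relative links and other schemes are filtered out.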
@@ -136,22 +139,26 @@ def crawl(url):
         # Get the next URL from the queue
         url = queue.pop()
         print(url) # for debugging and to see the progress
 
-        # Save text from the url to a <url>.txt file
-        with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
-
-            # Get the text from the URL using BeautifulSoup
-            soup = BeautifulSoup(requests.get(url).text, "html.parser")
-
-            # Get the text but remove the tags
-            text = soup.get_text()
-
-            # If the crawler gets to a page that requires JavaScript, it will stop the crawl
-            if ("You need to enable JavaScript to run this app." in text):
-                print("Unable to parse page " + url + " due to JavaScript being required")
-
-            # Otherwise, write the text to the file in the text directory
-            f.write(text)
+        # Try extracting the text from the link, if failed proceed with the next item in the queue
+        try:
+            # Save text from the url to a <url>.txt file
+            with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
+
+                # Get the text from the URL using BeautifulSoup
+                soup = BeautifulSoup(requests.get(url).text, "html.parser")
+
+                # Get the text but remove the tags
+                text = soup.get_text()
+
+                # If the crawler gets to a page that requires JavaScript, it will stop the crawl
+                if ("You need to enable JavaScript to run this app." in text):
+                    print("Unable to parse page " + url + " due to JavaScript being required")
+
+                # Otherwise, write the text to the file in the text directory
+                f.write(text)
+        except Exception as e:
+            print("Unable to parse page " + url)
 
         # Get the hyperlinks from the URL and add them to the queue
         for link in get_domain_hyperlinks(local_domain, url):
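The effect of the change is easiest to see in isolation. Below is a minimal sketch of the queue-draining loop, with a hypothetical fetch_and_save() standing in for the requests/BeautifulSoup/file-write body wrapped by the new try block; the failure is simulated:

    from collections import deque

    def fetch_and_save(url):
        # Hypothetical stand-in for the try-block body above
        # (requests.get, BeautifulSoup, write to text/<domain>/<name>.txt).
        if "broken" in url:
            raise OSError("simulated failure: network error, invalid filename, ...")
        print("saved " + url)

    queue = deque(["https://openai.com/", "https://openai.com/broken", "https://openai.com/about"])
    while queue:
        url = queue.pop()
        try:
            fetch_and_save(url)
        except Exception:
            # Before this commit, an exception here ended the whole crawl;
            # now it is reported and the loop moves on to the next URL.
            print("Unable to parse page " + url)

One detail worth noting in the diff itself: the output filename comes from url[8:].replace("/", "_"), which assumes every queued URL starts with the 8-character prefix "https://"; URLs with other prefixes would produce mangled names, one plausible source of the exceptions this commit now catches.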