Update web-qa.py

This commit is contained in:
Logan Kilpatrick 2023-02-06 11:28:31 -06:00 committed by GitHub
parent a7b8506c6e
commit a2837aa5cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -135,7 +135,7 @@ def crawl(url):
print(url) # for debugging and to see the progress
# Save text from the url to a <url>.txt file
with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="utf-8") as f:
with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
# Get the text from the URL using BeautifulSoup
soup = BeautifulSoup(requests.get(url).text, "html.parser")
@@ -181,7 +181,7 @@ texts=[]
for file in os.listdir("text/" + domain + "/"):
# Open the file and read the text
with open("text/" + domain + "/" + file, "r", encoding="utf-8") as f:
with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f:
text = f.read()
# Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces.