Python script: bulk download from Wikipedia (solving HTTP Error 403)

Wikipedia answers requests sent with urllib's default User-Agent with HTTP Error 403 (Forbidden), so the script below sends a browser-like User-Agent header and saves each downloaded page under its article title.
# save Wikipedia HTML pages (properly named) from a text list of URLs
# NOTE: article titles can contain "/" (e.g. https://en.wikipedia.org/wiki/V/H/S),
#       which is not allowed in file names, so it is replaced before saving
from bs4 import BeautifulSoup
import urllib.request
import urllib.error

failed = []
# a browser-like User-Agent avoids Wikipedia's HTTP Error 403
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

with open('wk.txt') as f:
    for line in f:
        line = line.strip()
        # accept links copied as local paths, e.g. file:///wiki/Page_name
        if "file:///wiki/" in line:
            line = line.replace("file:///wiki/", "https://en.wikipedia.org/wiki/")
        if "http" in line.lower():
            req = urllib.request.Request(line, headers=headers)
            try:
                with urllib.request.urlopen(req) as response:
                    page = response.read().decode('utf-8')
                soup = BeautifulSoup(page, "lxml")
                page_title = soup.title.string
                file_name = page_title.replace("/", "-") + ".html"
                with open(file_name, 'w', encoding='utf-8') as file_to_save:
                    file_to_save.write(page)
            except urllib.error.HTTPError as e:
                print(f"HTTP Error: {e.code} - {e.reason}")
                print(line)
                failed.append(line)
                continue
            except urllib.error.URLError as e:
                print(f"URL Error: {e.reason}")
                print(line)
                failed.append(line)
                continue

# keep the URLs that could not be downloaded
if failed:
    with open("err.txt", "w") as err_file:
        for i in failed:
            err_file.write(i + "\n")
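Usage sketch (the script name and list contents here are just examples): save the code as wk_download.py, install the two third-party packages it needs (pip install beautifulsoup4 lxml), and put one Wikipedia link per line in wk.txt, for example:

https://en.wikipedia.org/wiki/Python_(programming_language)
file:///wiki/V/H/S

Running python wk_download.py then writes files such as "Python (programming language) - Wikipedia.html" to the current directory, and any URL that still fails ends up listed in err.txt.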