Python 使用selenium抓取网页文本和下载音频
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'一个自动从https://podcast.duolingo.com/spanish中下载音频并且爬取文本的程序'
'需要配置下载以下所需库,并且配置好webdriver.Chrome(),否则报错'
import os
import re
import shelve
import time

import requests
from selenium import webdriver
def mainProc():
    """Entry point: index listing pages, then process every episode.

    The shelf returned by ``openDb`` is opened with ``writeback=True``, so
    lists appended to during the run live in an in-memory cache until the
    shelf is closed.  Closing in ``finally`` makes sure that progress is
    written to disk even when one of the crawl steps raises.
    """
    db = openDb()
    try:
        get_pages(db)
        get_episodes(db)
    finally:
        db.close()
def openDb():
    """Open the ``data`` shelf, initialising it on first use.

    The file name that ``shelve`` creates on disk depends on the dbm backend
    in use and is not always ``data.dat``.  Whether the store still needs its
    initial content is therefore decided from the keys it holds, not from a
    file-system check, so an existing store is never reset by accident.

    Returns:
        An open shelf (``writeback=True``) that contains list values under
        the keys ``"pages"`` and ``"episodes"``.
    """
    db = shelve.open("data", writeback=True)
    for key in ("pages", "episodes"):
        if key not in db:
            db[key] = []
    return db
def get_pages(db):
    """Discover listing pages and record the episode links found on them.

    The first listing page is ``https://podcast.duolingo.com/spanish``; the
    following ones append a number (``.../spanish2``, ``.../spanish3``, ...).
    At most 99 pages are probed.  A page already present in ``db["pages"]``
    is skipped without issuing a request, and the scan stops at the first
    page that does not answer with HTTP 200.

    Args:
        db: Mapping holding lists under ``"pages"`` and ``"episodes"``;
            both lists are extended in place.
    """
    main = 'https://podcast.duolingo.com/spanish'
    # Site root: the listing URL with its trailing "spanish" (7 chars) removed.
    site_root = main[:-7]
    # Raw string so that ``\s`` is a regex escape rather than an invalid
    # string escape; the capture is non-greedy so that two links on one
    # line of HTML are not merged into a single bogus match.
    link_pattern = re.compile(r'entry-title">\s*<a href="(.*?)" rel')

    for i in range(1, 100):
        page = main if i == 1 else main + str(i)
        if page in db["pages"]:
            continue

        r = requests.get(page)
        print(f'{page} with status code {r.status_code}.')
        if r.status_code != 200:
            break
        db["pages"].append(page)

        for href in link_pattern.findall(r.text):
            # The first two characters of the href are dropped before it is
            # joined to the site root (presumably a leading "./" -- confirm
            # against the live markup).
            episode = site_root + href[2:]
            # Do not queue the same episode twice.
            if episode not in db["episodes"]:
                db["episodes"].append(episode)
def get_episodes(db):
    """Request every stored episode URL and process the ones that load.

    Each URL in ``db["episodes"]`` is fetched and its status code printed.
    Only a response with status 200 is handed on: the transcript is saved
    first, then the audio is downloaded.  Other responses are skipped.
    """
    for episode in db["episodes"]:
        r = requests.get(episode)
        print(f'{episode} with status code {r.status_code}.')
        page_loaded = r.status_code == 200
        if page_loaded:
            # Write the page text to a file, then download the audio.
            get_transcript(episode)
            get_audios(r, episode)
def get_transcript(episode):
    """Save the transcript of one episode page as ``transcript/<name>.txt``.

    ``<name>`` is the last path segment of the episode URL (a trailing
    slash is ignored).  When the target file already exists nothing is
    fetched.  Otherwise the page is requested and every
    ``<strong>speaker</strong>text</p>`` match is written as one paragraph
    followed by a blank line.

    Args:
        episode: URL of the episode page.
    """
    # Strip a trailing "/" first, otherwise the last segment would be empty
    # and every such episode would map to "transcript/.txt".
    name = episode.rstrip('/').split('/')[-1]
    filename = 'transcript/' + name + '.txt'
    if os.path.exists(filename):
        print(filename, 'existed!')
        return

    req = requests.get(episode)
    print('{episode} with status code {status}.'.format(episode=episode, status=req.status_code))
    if req.status_code != 200:
        # Writing a file for a failed request would leave an empty
        # transcript behind that blocks every later retry.
        return

    os.makedirs('transcript', exist_ok=True)
    with open(filename, 'w+', encoding="utf-8") as fp:
        for lines in re.findall('strong>(.*)</strong>(.*)</p>', req.text):
            # ``lines`` is the (speaker, text) pair captured by the pattern.
            fp.write(''.join(lines))
            fp.write('\n\n')
    print(filename, 'added!')
def get_audios(r, episode):
    """Download the audio of one episode through a Chrome session.

    The player URL is read from the first ``<iframe ... src="...">`` found
    in the episode page.  Chrome opens that URL, the element with id
    ``download-player`` is clicked, and the function then waits until no
    ``.crdownload`` (unfinished Chrome download) file is left in the
    ``audio`` directory.

    Args:
        r: Response of the episode page; only ``r.text`` is read.
        episode: URL of the episode page, used in the message printed when
            the page contains no player iframe.
    """
    players = re.findall('<iframe .* src="(.*)" height', r.text)
    if not players:
        # Skip this episode instead of aborting the whole crawl.
        print(episode, 'has no audio player, skipped!')
        return
    audio = "https:" + players[0]

    # Chrome downloads into the same directory that is polled below; the
    # preference is given as an absolute path derived from the working
    # directory rather than a machine-specific literal.
    download_dir = os.path.abspath("audio")
    os.makedirs(download_dir, exist_ok=True)

    # Custom download configuration.
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("--ignore-certificate-errors")
    prefs = {"download.default_directory": download_dir}
    chromeOptions.add_experimental_option("prefs", prefs)

    print(audio)
    # ``options=`` and ``find_element("id", ...)`` replace the deprecated
    # ``chrome_options=`` / ``find_element_by_id`` spellings.
    browser = webdriver.Chrome(options=chromeOptions)
    try:
        browser.get(audio)
        browser.find_element("id", 'download-player').click()
        while True:
            # Sleep before looking: right after the click the temporary
            # ".crdownload" file may not exist yet.
            time.sleep(5)
            pending = [
                name for name in os.listdir(download_dir)
                if name.endswith(".crdownload")
            ]
            if not pending:
                break
    finally:
        # quit() also ends the driver process, and runs on errors too.
        browser.quit()
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    mainProc()