# Just playing around with Python — quick-and-dirty scraper, "it runs" is the bar here.
import os
from requests_html import HTMLSession
import sqlite3
from pathlib import Path
BASE_DIR = Path(__file__).resolve().parent.parent
domain = "https://www.helloworld.net"
class Helloworld:
    """Scraper for https://www.helloworld.net blog articles.

    Fetches the front-page article list and individual article pages,
    optionally persisting article HTML under <BASE_DIR>/data/.
    """

    def __init__(self):
        # BUG FIX: the original hard-coded "\\" Windows separators; build the
        # path with pathlib so it works on every OS.
        # The sqlite3 connection is reserved for persisting scraped data later.
        self.con = sqlite3.connect(str(BASE_DIR / "data" / "helloworld.db"), timeout=10)
        self.session = HTMLSession()

    def getContent(self, p, saveToFile=False):
        """Fetch one article page.

        p          -- article path such as '/p/<article-id>'.
        saveToFile -- when True, also write the article HTML to
                      <BASE_DIR>/data/<article-id>.html.
        Returns a dict with publication time, counters and article HTML,
        or False when the HTTP request does not return 200.
        """
        req = self.session.get(domain + p)
        p = p.split('/')  # p[2] is the article id, reused as the file name
        if req.status_code != 200:
            print("获取失败")
            return False
        author = req.html.find(".author-info", first=True)
        data = {}
        data['pubtime'] = author.xpath('//span[@title="发布时间"]//time/text()')[0].strip()
        data['yuedushu'] = author.xpath('//span[@title="阅读数"]/text()')[0].strip()
        data['zan'] = author.xpath('//span[@title="点赞数"]/text()')[0].strip()
        data['shoucang'] = author.xpath('//span[@title="收藏数"]/text()')[0].strip()
        data['content'] = req.html.find("article>div[id='htmlContent']", first=True).html
        print(data)
        if saveToFile:
            # BUG FIX: the original concatenated the directory and the file
            # name without a separator (".../dataXYZ.html"); join them properly.
            data['save_path'] = os.path.join(str(BASE_DIR), "data", p[2] + ".html")
            # Context manager guarantees the handle is closed even on error.
            with open(data['save_path'], 'wb') as f:
                f.write(bytes(data['content'], encoding="utf8"))
        return data

    def getArticleList(self, hot=False, callback=None):
        """Fetch the article list from the site front page.

        hot      -- when True, request the hottest-first ordering.
        callback -- optional callable invoked with each article's data dict.
        Returns a list of article-data dicts, or False on HTTP failure.
        """
        url = domain
        if hot:
            # BUG FIX: the original sent "?sort = hottest" — the embedded
            # spaces break the query parameter the server expects.
            url += "?sort=hottest"
        req = self.session.get(url)
        if req.status_code != 200:
            print("获取失败")
            return False
        blog_list_container = req.html.find(".blog_list_container>div.blog-item")
        dataSet = []
        for item in blog_list_container:
            data = {}
            author = {}
            leftItem = item.find(".item-left", first=True)
            data['p'] = leftItem.xpath('//a/@href')[0]
            data['short_desc'] = leftItem.find("p", first=True).text
            author['path'] = leftItem.find(".infos .pre-infos", first=True).xpath("//a/@href")[0]
            author['portrait'] = leftItem.find(".infos .pre-infos a", first=True).xpath("//img/@src")[0]
            author['nickname'] = leftItem.find(".infos .pre-infos .nickname", first=True).text
            data['time'] = leftItem.find(".infos .pre-infos .time", first=True).text
            data['yuedushu'] = \
                leftItem.find(".infos .after-infos", first=True).xpath(
                    '//div[@title="阅读数"]//span[@class="name"]/text()')[0].strip()
            data['zan'] = \
                leftItem.find(".infos .after-infos", first=True).xpath(
                    '//div[@title="点赞数"]//span[@class="name"]/text()')[0].strip()
            rightItem = item.find(".item-right", first=True)
            data['cover_image'] = ''
            if rightItem:
                # Cover image is optional; keep '' when none is present.
                data['cover_image'] = rightItem.xpath("//img/@src")
                if data['cover_image']:
                    data['cover_image'] = data['cover_image'][0]
            data['author'] = author
            dataSet.append(data)
            if callback:
                callback(data)
            print(data)
        return dataSet
def mycallback(data):
    """Fetch the full article content for one article-list entry.

    BUG FIX: `data` is a plain dict built by getArticleList, so the article
    path must be read with data['p'] — the original `data.p` attribute
    access raises AttributeError on every call.
    """
    Helloworld().getContent(p=data['p'], saveToFile=False)
if __name__ == '__main__':
    # Crawl the front-page article list; mycallback fetches each article body.
    spider = Helloworld()
    spider.getArticleList(callback=mycallback)
# TODO: the article list is paginated; Selenium could be used to walk the pages.