# Scrape information about software-related jobs from 51job
import re

import requests
from bs4 import BeautifulSoup
from pymysql import connect
# Create the database connection and cursor at module level
conn = connect(user="root", password="root", host="localhost", database="python", charset="utf8")
cursor = conn.cursor()
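# Note: the connection parameters above are this script's hard-coded defaults
# (a local MySQL server whose "python" database already exists); in practice
# they would come from configuration rather than source code.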


# Fetch the page source for a search-results URL
def get_html_resources(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print("Failed to fetch the page source........")


# Parse the listing page, yield job records, and insert them into the database
def parse_detail_page(html, headers, table, cursor):
    # The search results are embedded as a JSON blob inside a <script> tag
    pattern = re.compile('engine_search_result":(.*?)</script>', re.S)
    items = re.findall(pattern, html)
    # Pull the per-job fields out of that blob
    pattern_detail = re.compile(
        'job_href":"(.*?)","job_name":"(.*?)".*?company_name":"(.*?)","providesalary_text":"(.*?)".*?attribute_text":(.*?),"companysize_text',
        re.S)
    items_detail = re.findall(pattern_detail, items[0])
    for item in items_detail:
        address = []
        education = []
        content = []
        job_url = str(item[0]).replace("\\", "")
        # attribute_text is a JSON array; strip the brackets and quotes, then
        # split on commas (city is first; the degree sits at index 2 when four
        # fields are present and index 1 when there are three)
        job_address_education = str(item[4]).replace('["', "").replace('"]', "").replace('"', "").split(",")
        if len(job_address_education) == 4:
            address.append(job_address_education[0])
            education.append(job_address_education[2])
        if len(job_address_education) == 3:
            address.append(job_address_education[0])
            education.append(job_address_education[1])
        # Fetch the job's detail page for the full requirements text
        response = requests.get(job_url, headers=headers)
        response.encoding = "gbk"
        try:
            if response.status_code == 200:
                detail_html = response.text
                soup = BeautifulSoup(detail_html, "lxml")
                job_request = soup.find("div", class_="bmsg job_msg inbox").text
                content.append(job_request)
            else:
                print("Failed to fetch the detail page")
        except Exception:
            # The requirements div may be missing; leave content empty then
            pass
        # Guard against empty lists so one malformed record does not stop the generator
        job_address = address[0] if address else ""
        job_education = education[0] if education else ""
        job_require = content[0] if content else ""
        yield {
            "job_name": item[1],
            "company_name": item[2],
            "salary": item[3],
            "job_address": job_address,
            "education": job_education,
            "job_require": job_require,
        }
        try:
            # Parameterized insert; string concatenation would break on quotes in the data
            sql = ("insert into " + str(table) +
                   " (job_name, company_name, salary, job_address, education, job_require)"
                   " values (%s, %s, %s, %s, %s, %s);")
            cursor.execute(sql, (item[1], item[2], item[3], job_address, job_education, job_require))
            conn.commit()
        except Exception:
            print("Insert failed............")
            conn.rollback()
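

# The tables created in main() define min_salary/max_salary columns that this
# script never populates. A parser along the lines below could fill them in,
# e.g. parse_salary_range("1-1.5万/月") -> (10000, 15000). The salary formats
# handled here are assumptions about 51job's providesalary_text field, not
# something the original script verifies.
def parse_salary_range(salary_text):
    match = re.match(r"([\d.]+)(?:-([\d.]+))?(万|千)/月", salary_text)
    if not match:
        return None, None
    low, high, unit = match.groups()
    scale = 10000 if unit == "万" else 1000
    min_salary = int(float(low) * scale)
    max_salary = int(float(high) * scale) if high else min_salary
    return min_salary, max_salary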


def main():
    # Request headers so the crawler presents itself as a normal browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }
    # One table per job category
    table_list = ["android", "nature_language", "deep_learning", "computer_vision", "big_data", "machine_learning",
                  "production_manager", "education", "finance", "service", "transportation"]
    # Matching search keywords (left in Chinese because 51job expects Chinese queries)
    job_name_list = ["android开发", "自然语言处理", "深度学习", "计算机视觉", "大数据", "机器学习", "产品经理", "教师", "金融", "服务类", "运输"]
    # Create each table if needed, then crawl the corresponding keyword
    for table in table_list:
        try:
            # Table schema; min_salary/max_salary are created but never filled by this script
            sql = ("create table " + str(table) +
                   " (job_name varchar(200), company_name varchar(300), salary varchar(300),"
                   " job_address varchar(500), education varchar(100), job_require varchar(5000),"
                   " min_salary int(11) null, max_salary int(11));")
            cursor.execute(sql)
            conn.commit()
        except Exception:
            print("Table already exists......................")
        finally:
            # Crawl the first two result pages for this table's keyword
            job = job_name_list[table_list.index(table)]
            for i in range(2):
                url = ("https://search.51job.com/list/000000,000000,0000,00,9,99," + str(job) + ",2," +
                       str(i + 1) + ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
                       "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=")
                try:
                    html = get_html_resources(url, headers)
                    items = parse_detail_page(html, headers, table, cursor)
                    for item in items:
                        print(item)
                except Exception:
                    print(url)
                    print("Crawl failed for this page")


if __name__ == "__main__":
    main()
    # Release the database resources once every table has been processed
    cursor.close()
    conn.close()