爬取软件工程师相关信息

菜鸟小欧
• 阅读 1882

# 爬取有关软件工作的信息

import re

import requests
from bs4 import BeautifulSoup
from pymysql import connect

# Open the module-level MySQL connection and the cursor used by the functions in this script.
# NOTE(review): credentials are hard-coded here; consider loading them from configuration.
conn = connect(
    host="localhost",
    user="root",
    password="root",
    database="python",
    charset="utf8",
)
cursor = conn.cursor()


# Download the HTML source of a search-result page
def get_html_resources(url, headers, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled server.

    Returns:
        The response text when the server answers with status 200, otherwise
        ``None`` (after printing an error message).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    print("获取网站源码出错........")
    # Make the failure result explicit so callers know they may receive None.
    return None
# Parse a search-result page, fetch each job's detail page and store the job in MySQL
def _split_address_education(attribute_text):
    """Return ``(address, education)`` extracted from a raw ``attribute_text`` value.

    The value is the text of a JSON-like list. Only the two layouts the crawler
    knows are interpreted: four fields (address first, education third) and three
    fields (address first, education second). Any other layout yields two empty
    strings instead of raising.
    """
    fields = str(attribute_text).replace('["', "").replace('"]', "").replace('"', "").split(",")
    if len(fields) == 4:
        return fields[0], fields[2]
    if len(fields) == 3:
        return fields[0], fields[1]
    return "", ""


def _fetch_job_requirements(job_url, headers):
    """Return the job-description text of a detail page, or ``""`` when unavailable."""
    try:
        response = requests.get(job_url, headers=headers, timeout=10)
        response.encoding = "gbk"
        if response.status_code != 200:
            print("获取详情页的信息错误")
            return ""
        soup = BeautifulSoup(response.text, "lxml")
        # soup.find returns None when the block is missing; the resulting
        # AttributeError is handled below together with network errors.
        return soup.find("div", class_="bmsg job_msg inbox").text
    except Exception as exc:
        # Best effort: one broken detail page must not abort the whole result page.
        print("获取详情页的信息错误", exc)
        return ""


def parse_detail_page(html, headers, table, cursor):
    """Yield one dict per job found in *html* and insert each job into *table*.

    Args:
        html: Source of a search-result page; ``None`` or empty text yields nothing.
        headers: HTTP headers used when requesting each job's detail page.
        table: Name of the table to insert into. It is spliced into the SQL text,
            so it must be a trusted identifier, never scraped data.
        cursor: Database cursor used to execute the INSERT statements.

    Yields:
        A dict with the keys 工作名称, 公司名称, 工作待遇, 工作地点, 学历要求 and 工作要求.
        Fields that could not be determined are empty strings.

    Note:
        The INSERT for a job runs when the consumer asks for the next item, i.e.
        after that job's dict has been yielded. Commit and rollback go through
        the module-level ``conn``.
    """
    if not html:
        return []

    page_pattern = re.compile(r'engine_search_result":(.*?)</script>', re.S)
    items = page_pattern.findall(html)
    if not items:
        # No embedded search-result blob on this page: nothing to parse.
        return items

    job_pattern = re.compile(
        r'job_href":"(.*?)","job_name":"(.*?)".*?company_name":"(.*?)","providesalary_text":"(.*?)".*?attribute_text":(.*?),"companysize_text',
        re.S)

    # Scraped values are passed as parameters so quotes in the text cannot break
    # (or inject into) the statement; only the trusted table name is concatenated.
    insert_sql = (
        "insert into " + str(table)
        + "(job_name,company_name,salary,job_address,education,job_require)"
        + " values (%s,%s,%s,%s,%s,%s)"
    )

    for item in job_pattern.findall(items[0]):
        # The URL is embedded with escaped slashes ("\/"); strip the backslashes.
        job_url = str(item[0]).replace("\\", "")
        address, education = _split_address_education(item[4])
        job_request = _fetch_job_requirements(job_url, headers)

        yield {
            "工作名称": item[1],
            "公司名称": item[2],
            "工作待遇": item[3],
            "工作地点": address,
            "学历要求": education,
            "工作要求": job_request,
        }

        try:
            cursor.execute(insert_sql, (item[1], item[2], item[3], address, education, job_request))
            conn.commit()
        except Exception as exc:
            print("数据插入异常............", exc)
            conn.rollback()

    # Kept for backward compatibility: becomes the generator's StopIteration value.
    return items


def main():
    """Create one table per job category and crawl two result pages for each.

    For every (table, keyword) pair the table is created first. A failure there
    is reported and does not stop the crawl. Result pages 1 and 2 of the 51job
    search for the keyword are then downloaded, parsed and printed item by item.
    ``parse_detail_page`` stores each item in the table as it is consumed.
    """
    # Browser-like request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }

    # Table names and the search keyword used for each one (same order in both lists)
    table_list = ["android", "nature_language", "deep_learning", "computer_vision", "big_data", "machine_learning",
                  "production_manager", "education", "finance", "service", "transportation"]
    job_name_list = ["android开发", "自然语言处理", "深度学习", "计算机视觉", "大数据", "机器学习", "产品经理", "教师", "金融", "服务类", "运输"]

    for table, job in zip(table_list, job_name_list):
        try:
            sql = "create table " + str(
                table) + "(job_name varchar(200),company_name varchar(300),salary varchar(300),job_address varchar(500),education varchar(100),job_require varchar(5000),min_salary int(11) null,max_salary int(11));"
            cursor.execute(sql)
            conn.commit()
        except Exception as exc:
            # Usually the table already exists; print the cause so other
            # failures (e.g. a lost connection) are not mistaken for that.
            # Catching Exception rather than everything keeps Ctrl-C working.
            print("数据表已存在正......................", exc)

        # Crawl result pages 1 and 2 for this keyword
        for page in range(1, 3):
            url = "https://search.51job.com/list/000000,000000,0000,00,9,99," + str(job) + ",2," + str(
                page) + ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
            try:
                html = get_html_resources(url, headers)
                for item in parse_detail_page(html, headers, table, cursor):
                    print(item)
            except Exception as exc:
                # A failing page is reported and skipped; the crawl continues.
                print(url)
                print("获取异常", exc)


if __name__ == "__main__":
    main()
点赞
收藏
评论区
推荐文章
blmius blmius
3年前
MySQL:[Err] 1292 - Incorrect datetime value: ‘0000-00-00 00:00:00‘ for column ‘CREATE_TIME‘ at row 1
文章目录 问题:用navicat导入数据时,报错。原因:这是因为当前的MySQL不支持datetime为0的情况。解决:修改sql_mode。sql_mode:SQL Mode定义了MySQL应支持的SQL语法、数据校验等,这样可以更容易地在不同的环境中使用MySQL。全局s
皕杰报表之UUID
​在我们用皕杰报表工具设计填报报表时,如何在新增行里自动增加id呢?能新增整数排序id吗?目前可以在新增行里自动增加id,但只能用uuid函数增加UUID编码,不能新增整数排序id。uuid函数说明:获取一个UUID,可以在填报表中用来创建数据ID语法:uuid()或uuid(sep)参数说明:sep布尔值,生成的uuid中是否包含分隔符'',缺省为
Stella981 Stella981
3年前
Python3:sqlalchemy对mysql数据库操作,非sql语句
Python3:sqlalchemy对mysql数据库操作,非sql语句python3authorlizmdatetime2018020110:00:00coding:utf8'''
Wesley13 Wesley13
3年前
4cast
4castpackageloadcsv.KumarAwanish发布:2020122117:43:04.501348作者:KumarAwanish作者邮箱:awanish00@gmail.com首页:
Stella981 Stella981
3年前
Python之time模块的时间戳、时间字符串格式化与转换
Python处理时间和时间戳的内置模块就有time,和datetime两个,本文先说time模块。关于时间戳的几个概念时间戳,根据1970年1月1日00:00:00开始按秒计算的偏移量。时间元组(struct_time),包含9个元素。 time.struct_time(tm_y
Wesley13 Wesley13
3年前
mysql设置时区
mysql设置时区 mysql_query("SET time_zone = '+8:00'") or die('时区设置失败,请联系管理员!'); 中国在东8区所以加8。方法二:select count(user_id) as device, CONVERT_TZ(FROM_UNIXTIME(reg_time),'+08:00','0
Wesley13 Wesley13
3年前
thinkphp 基本配置
return array( // 定义数据库连接信息 'DB_TYPE' => 'mysql', // 指定数据库是mysql 'DB_HOST' => 'localhost', 'DB_NAME' => 'uchome', // 数据库名 'DB_USER' => 'root
Wesley13 Wesley13
3年前
MySQL部分从库上面因为大量的临时表tmp_table造成慢查询
背景描述Time:20190124T00:08:14.70572408:00User@Host:@Id:Schema:sentrymetaLast_errno:0Killed:0Query_time:0.315758Lock_
Python进阶者 Python进阶者
10个月前
Excel中这日期老是出来00:00:00,怎么用Pandas把这个去除
大家好,我是皮皮。一、前言前几天在Python白银交流群【上海新年人】问了一个Pandas数据筛选的问题。问题如下:这日期老是出来00:00:00,怎么把这个去除。二、实现过程后来【论草莓如何成为冻干莓】给了一个思路和代码如下:pd.toexcel之前把这
菜鸟小欧
菜鸟小欧
Lv1
夜闻归雁生乡思,病入新年感物华。
文章
5
粉丝
1
获赞
0