Baidu Music Spider


A spider for Baidu Music songs:

1. Analyze the Baidu Music song download API and work out the parameters to assemble

2. Decide whether login is required

  a. with cookies

  b. with Selenium

3. Parse the song info pages

4. Design the data tables (a schema sketch follows below)

Song type table: screenshot of the network_type schema (image not reproduced)

Song table: screenshot of the network_music schema (image not reproduced)

The exact tables don't really matter; design your own.
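The two screenshots didn't survive the repost, so here is a minimal sketch of the schemas, reverse-engineered from the INSERT statements in the script below (there is also a small music_log(page, datetime) table used by the commented-out error logger). The table and column names come from the code; every type and length is a guess, not the author's original DDL.

# -*- coding: utf-8 -*-
# Minimal schema sketch, inferred from the spider's INSERT statements.
# Column types/lengths are assumptions; only the names come from the code.
import pymysql

conn = pymysql.connect(host="localhost", user="root", passwd="root", db='test1', charset="utf8")
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS network_type (
        id           INT PRIMARY KEY AUTO_INCREMENT,
        PID          INT,              -- parent type id, -1 for top level
        RESOURCETYPE CHAR(1),          -- 'm' for music
        TYPENAME     VARCHAR(64)
    )""")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS network_music (
        ID             VARCHAR(16),    -- the spider computes this by hand
        NAME           VARCHAR(255),
        SINGER         VARCHAR(255),
        ALBUM          VARCHAR(255),
        PUBLISHTIME    VARCHAR(32),
        PUBLISHCOMPANY VARCHAR(255),
        COMPOSER       VARCHAR(255),
        LYRICS         TEXT,
        FILESIZE       VARCHAR(32),
        FILETIME       INT,
        USERHEAD       VARCHAR(255),   -- cover image path
        TYPES          VARCHAR(64),    -- ',1,2,'-style list of type ids
        STATUS         TINYINT,
        WORDS          VARCHAR(255),   -- lyrics (.lrc) file path
        FILEPATH       VARCHAR(255)    -- mp3 file path
    )""")
conn.commit()
conn.close()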

-------------------------------

# -*- coding: utf-8 -*-
'''
    ***
        _author_= "fengshaungzi"
        _time_='2018-4-10'
        _python_version_ = 'python2.7'
        _script_type_ = 'spider'
        url = 'http://music.baidu.com/tag/类型?start=0&size=20&third_type=0'
    ***
'''
from os import path
from bs4 import BeautifulSoup
import urllib, urllib2, requests, cookielib
import sys, os, time, datetime
import pymysql, shutil
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Python 2 hack: make implicit str/unicode conversion default to UTF-8
reload(sys)
sys.setdefaultencoding('utf-8')
d = path.dirname(__file__)  # script directory, used for relative save paths

class BadiuMusicSpider():
    def __init__(self):
        pass
    def login(self, cursor, type_id, type_q):
        # Headless options are prepared but deliberately not passed in:
        # the CAPTCHA below has to be read from a visible browser window.
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get("http://i.baidu.com/welcome/")
        time.sleep(5)
        driver.find_element_by_xpath('/html/body/header/div/div/a[2]').click()
        time.sleep(2)
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__userName"]').clear()
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__userName"]').send_keys('your-username')
        time.sleep(2)
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__password"]').clear()
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__password"]').send_keys('your-password')
        ## Handle the CAPTCHA, if one appears
        time.sleep(3)
        try:
            driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__verifyCodeChange"]').click()
            captcha = raw_input(u'Enter the CAPTCHA: ')
            code = driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__verifyCode"]')
            code.clear()
            code.send_keys(captcha)
        except:
            print u'No CAPTCHA found.'
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__submit"]').submit()
        time.sleep(2)
        self.parse_html(driver, cursor, type_id, type_q)
    def parse_html(self, driver, cursor, type_id, type_q, page=1):
        # Other fetch styles were tried here (urllib2.urlopen, a cookie-aware
        # opener, requests.get with headers + cookies); in the end the page
        # source is simply taken from the Selenium driver.
        start = (page - 1) * 20
        print u'--- Fetching page {0} ---'.format(page)
        url = 'http://music.baidu.com/tag/{0}?start={1}&size=20&third_type=0'.format(type_q, start)
        driver.get(url)
        time.sleep(2)
        response = driver.page_source
        obj = BeautifulSoup(response, 'html.parser')
        ## Collect the m_url of every song on the page
        span_list = obj.find_all('span', {"class": "song-title"})
        ## Check whether there is a next page
        try:
            driver.find_element_by_class_name('page-navigator-next')
            next_page = 1
        except:
            next_page = 0
        #try:
        for v in span_list:
            try:
                m_url = v.find('a')['href']
            except:
                continue
            ### Extract the song_id from the relative link
            song_id = m_url.replace('/song/', '')
            ## Prepend the host
            m_url = 'http://music.baidu.com{0}'.format(m_url)
            ### Scrape the song's info page
            data = self.save_music_info(m_url, type_id)
            ### data['check'] == 0 means the song is already in the database
            if data.has_key('check'):
                print u'--- Song already exists ---'
                continue
            singer_path = u"G:\\www\\music2\\" + data['singer']
            ### Info collected; now download the song itself (needs song_id)
            music_lrc = self.save_music_lrc(driver, song_id, singer_path)
            if music_lrc.has_key('words') and music_lrc['words'] == '暂无':  # sentinel: no lyrics
                data['words'] = ''
            else:
                print u"Lyrics file: " + music_lrc['lrc_name']
                data['words'] = u'music2/LRC/' + music_lrc['lrc_name']
            data['filepath'] = u'music2/{0}/{1}.mp3'.format(data['singer'], data['name'])
            ## Compute the next id by hand (ID is not auto-increment)
            cursor.execute('select id from network_music order by cast(id as SIGNED INTEGER) desc limit 0,1')
            old_id = cursor.fetchone()
            if old_id:
                id_n = str(int(old_id[0]) + 1)
            else:
                id_n = str(1)
            # Insert into the database
            rows = [(id_n, data['name'], data['singer'], data['album'], data['publishtime'], data['publishcompany'], data['composer'], data['lyrics'], \
                data['filesize'], data['filetime'], data['userhead'], data['types'], data['status'], data['words'], data['filepath'])]
            self.save_db(cursor, rows)
        '''
        except:
            ## Log the page that failed so the crawl can be resumed later
            try:
                datetime_now = datetime.datetime.now()
                datetime_str = '{0}-{1}-{2} {3}:{4}:{5}'.format(datetime_now.year, datetime_now.month, datetime_now.day, datetime_now.hour, datetime_now.minute, datetime_now.second)
                effect_row = cursor.executemany("insert into music_log(page,datetime)values(%s,%s)", [(page, datetime_str)])
                ## Commit, or the new rows are not persisted
                conn.commit()
            except:
                print 'Failed to add the log entry!'
        '''
        page = page + 1
        #input = raw_input('Press Enter to continue: ')
        if next_page == 1:
            print u'--- Fetching the next page ---'
            self.parse_html(driver, cursor, type_id, type_q, page=page)
        else:
            print u"--- Crawl finished, shutting down ---"
            cursor.close()
            conn.close()

    def save_music_info(self, m_url, type_id):
        data = {}
        music_info_response = urllib2.urlopen(m_url).read()
        music_info_obj = BeautifulSoup(music_info_response, 'html.parser')
        ## Song fields: name, singer, album, pubdate, pic, tag, company
        name = music_info_obj.find('span', {"class": "name"}).text.strip()
        name = name.replace('"', '').replace("'", '')
        singer = music_info_obj.find('span', {"class": "artist"}).find('a').text.strip()
        singer = singer.replace('"', '').replace("'", '')
        if os.path.exists("G:\\www\\music2\\" + singer) == False:
            os.mkdir("G:\\www\\music2\\" + singer)
        else:
            print u'Singer folder already exists.'
        album = music_info_obj.find('p', {"class": "album"}).find('a').text.strip()
        ## Release date: strip the label; the literal u'发行时间:' ("release
        ## date:") must match the page text exactly, so it stays in Chinese
        if music_info_obj.find('p', {"class": "publish"}).text.strip() == u'发行时间:':
            publishtime = u'未知'  # "unknown"
        else:
            publishtime = music_info_obj.find('p', {"class": "publish"}).text.strip()
            publishtime = publishtime.replace(u'发行时间:', '')
        ## Publisher: same treatment (u'发行公司:' = "publisher:")
        if music_info_obj.find('p', {"class": "company"}).text.strip() == u'发行公司:':
            publishcompany = u'未知'  # "unknown"
        else:
            publishcompany = music_info_obj.find('p', {"class": "company"}).text.strip()
            publishcompany = publishcompany.replace(u'发行公司:', '')

        ### Cover image
        pic_path = ''  # default, so the assignment below never hits a NameError
        pic_url = music_info_obj.find('img', {"class": "music-song-ing"})['src']
        if pic_url:
            pic_path = self.save_pic(pic_url)
        data['name'] = name
        print u"Title: " + name
        data['singer'] = singer
        print u"Singer: " + singer
        data['album'] = album
        data['publishtime'] = publishtime
        data['publishcompany'] = publishcompany
        data['composer'] = ''
        data['lyrics'] = ''
        data['filesize'] = ''
        data['filetime'] = 0
        data['userhead'] = pic_path if pic_path else ''
        data['types'] = ',' + str(type_id) + ','
        data['status'] = 0
        ## Deduplicate against the database; cursor and conn are the
        ## module-level globals created in __main__
        cursor.execute('select id,TYPES from network_music where NAME="{0}" and SINGER="{1}"'.format(name, singer))
        result_types = cursor.fetchall()
        if result_types:
            if str(type_id) in result_types[0][1]:
                pass
            else:
                ## Known song, new type: append this type_id to TYPES
                types = result_types[0][1] + str(type_id) + ','
                cursor.execute("UPDATE network_music SET TYPES='{0}' WHERE id ={1}".format(types, result_types[0][0]))
                ## Commit, or the update is not persisted
                conn.commit()
            data['check'] = 0
        return data

    def save_music_lrc(self, driver, song_id, singer_path):
        music_lrc = {}
        ## Hitting this endpoint makes the browser download the mp3
        m_api = 'http://music.baidu.com/data/music/file?link=&song_id={0}'.format(song_id)
        driver.get(m_api)
        time.sleep(3)
        ### Pick the newest file in the download folder
        path_d = u'C:\\Users\\hz\\Downloads'
        file_lists = os.listdir(path_d)
        try:
            file_lists.sort(key=lambda fn: os.path.getmtime(path_d + "\\" + fn))
            filename = file_lists[-1]
            if filename:
                ### Move it into the singer's folder
                shutil.move(u'C:\\Users\\hz\\Downloads\\' + filename, singer_path)
        except:
            print u"Move failed (bad file name); fix it by hand"
        ## Back to the song page to grab the lyrics link
        driver.get('http://music.baidu.com/song/{0}'.format(song_id))
        time.sleep(2)
        try:
            l_api = driver.find_element_by_xpath('//*[@id="lyricCont"]').get_attribute('data-lrclink')
            driver.get(l_api)
            time.sleep(2)
            try:
                music_lrc['lrc_name'] = self.get_lrc_path()
            except:
                print u'Failed to get the lyrics file name'
        except:
            music_lrc['words'] = '暂无'  # sentinel: "no lyrics", checked in parse_html
            print u'No lyrics'
        return music_lrc

    def save_db(self, cursor, rows):
        print rows
        try:
            effect_row = cursor.executemany("insert into network_music(ID,NAME,SINGER,ALBUM,PUBLISHTIME,PUBLISHCOMPANY,COMPOSER,LYRICS, \
                FILESIZE,FILETIME,USERHEAD,TYPES,STATUS,WORDS,FILEPATH)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ", rows)
            ## Commit, or the inserted rows are not persisted
            conn.commit()
        except:
            print 'Insert failed!'

    def save_pic(self, pic_url, save_path=''):
        ## Work out the image extension: Baidu appends '@...' resize
        ## parameters right after it in the url
        pic_list = ['.jpg@', '.png@', '.jpeg@', '.JPG@', '.PNG@', '.JPEG@']
        endname = '.errorpic'
        for v in pic_list:
            if v in pic_url:
                endname = v.replace('@', '')  # keep the matched extension
                break
        save_path = path.join(d, 'music2/USERHEAD/')
        ### Use a timestamp as the file name for now
        picName = int(time.time())
        savepic = save_path + str(picName) + endname
        try:
            urllib.urlretrieve(pic_url, savepic)
            return 'music2/USERHEAD/' + str(picName) + endname
        except:
            return 'no'

    def get_lrc_path(self):
        ## The newest file in the download folder is the lyrics just fetched
        path_d = u'C:\\Users\\hz\\Downloads'
        file_lists = os.listdir(path_d)
        file_lists.sort(key=lambda fn: os.path.getmtime(path_d + "\\" + fn))
        lrc_name = file_lists[-1]
        '''
        if lrc_name:
            shutil.move(u'C:\\Users\\hz\\Downloads\\' + lrc_name, u'G:\\www\\music2\\LRC\\')
        '''
        return lrc_name

    '''
    Alternative download helpers, kept for reference:

    def auto_down1(self, url, filename):
        try:
            urllib.urlretrieve(url, filename)
        except urllib.ContentTooShortError:
            print 'Network conditions are not good. Retrying.'
            self.auto_down1(url, filename)

    def auto_down2(self, url, filename):
        ## Build a cookie dict from a raw cookie string copied from the browser
        raw_cookies = "PSTM=1523331116; BIDUPSID=6598753517A81D738FD546C2D96EDAC5; BAIDUID=E5EE59A93C8788A953248CD76BEBD48D:FG=1; H_PS_PSSID=1425_18194_21127_26182_20928; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; PHPSESSID=bae76nl31pln7r47vi3i1o9jh7; Hm_lvt_4010fd5075fcfe46a16ec4cb65e02f04=1523420559,1523420572; PSINO=2; Hm_lpvt_4010fd5075fcfe46a16ec4cb65e02f04=1523425208"
        cookies = {}
        for line in raw_cookies.split(';'):
            key, value = line.strip().split('=', 1)  # split once: key, rest
            cookies[key] = value
        r = requests.get(url, stream=True, cookies=cookies)
        f = open(filename, "wb")
        for chunk in r.iter_content(chunk_size=512):
            if chunk:
                f.write(chunk)
        f.close()

    def auto_down3(self, url, filename):
        ## Load cookies exported in Mozilla format and install a global opener
        cookie = cookielib.MozillaCookieJar()
        cookie.load('c.txt', ignore_expires=True, ignore_discard=True)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        urllib2.install_opener(opener)
        music = urllib2.urlopen(url).read()
        f = open(filename, 'wb')
        f.write(music)
        f.close()
    '''

if __name__ == "__main__":
    print 'Starting....'
    for i in range(5):
        sys.stdout.write('>' * i + '\n')
        sys.stdout.flush()
        time.sleep(0.5)
    conn = pymysql.connect(host="localhost", user="root", passwd="root", db='test1', charset="utf8")
    # Create a cursor
    cursor = conn.cursor()
    music_type = raw_input('Enter the song type to crawl: ').strip()
    ## Add the type to the database, but only if it is not there already
    result = cursor.execute("select id from network_type where RESOURCETYPE='m' and TYPENAME='{0}'".format(music_type))
    if result == 0:
        print u'--- New type, adding it to the database ---'
        effect_row = cursor.executemany("insert into network_type(PID,RESOURCETYPE,TYPENAME)values(%s,%s,%s)", [(-1, 'm', music_type)])
        type_id = int(cursor.lastrowid)
    else:
        print u'--- Type already exists, nothing to add ---'
        type_val = cursor.fetchall()
        type_id = type_val[0][0]
    ## Commit, or the new row is not persisted
    conn.commit()
    type_q = urllib2.quote(music_type)
    # Run the spider
    bmSpider = BadiuMusicSpider()
    bmSpider.login(cursor, type_id, type_q)

---- The logic of the code

Step 1: log in to Baidu with Selenium. (My original plan was to log in once with Selenium, export the cookies, and then load them for later requests, but I ran into some problems and, with work getting in the way, dropped the idea; I'll try it again when I have time. CAPTCHAs aren't handled automatically: if one comes up, close the browser and restart. Once the login succeeds you can crawl for quite a while.) A rough sketch of that cookie hand-off is shown below.
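A minimal sketch of the shelved cookie hand-off, assuming a driver that has already logged in the way BadiuMusicSpider.login() does; this is the idea, not something the script above implements:

# Minimal sketch: copy the Selenium session cookies into a requests.Session
# so later pages can be fetched without the browser. Assumes `driver` is a
# logged-in webdriver; the script above does not actually do this.
import requests
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("http://i.baidu.com/welcome/")
# ... log in here, e.g. as in BadiuMusicSpider.login() ...

session = requests.Session()
for c in driver.get_cookies():  # each cookie is a dict: name, value, domain, ...
    session.cookies.set(c['name'], c['value'], domain=c.get('domain'))
driver.quit()

# From here on, plain HTTP requests carry the login session
resp = session.get('http://music.baidu.com/tag/%E6%B5%81%E8%A1%8C?start=0&size=20&third_type=0')
print resp.status_code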

Step 2: enter a song type; crawling starts from page one by default, and after that it's just the loops you see above: scraping each song, inserting rows into the database, and moving the downloaded files into place. A sketch of doing the download step without the browser follows.
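For completeness, a rough sketch of fetching one song straight through the endpoint save_music_lrc() drives the browser to, using the cookie session from the previous sketch; the song_id and save path below are placeholders, and whether the endpoint still responds today is another question:

# Rough sketch: download one song via the endpoint used in save_music_lrc(),
# but through the requests session from the previous sketch instead of the
# browser. song_id and dest_path are placeholders.
def download_song(session, song_id, dest_path):
    m_api = 'http://music.baidu.com/data/music/file?link=&song_id={0}'.format(song_id)
    r = session.get(m_api, stream=True)
    f = open(dest_path, 'wb')
    for chunk in r.iter_content(chunk_size=512):  # stream to disk in small chunks
        if chunk:
            f.write(chunk)
    f.close()

download_song(session, '123456', u'G:\\www\\music2\\test.mp3')  # placeholder values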

All in all it's a fairly simple crawler; go easy on me where it falls short.
