I. Defining a downloader middleware that sets a random User-Agent
1. Add the code in middlewares.py
import random
from Tencent.settings import USER_AGENTS_LIST  # note the import path; ignore PyCharm's error highlight

class UserAgentMiddleware(object):
    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENTS_LIST)
        request.headers['User-Agent'] = user_agent
        # no return statement: returning None lets the request continue through the chain

class CheckUA:
    def process_response(self, request, response, spider):
        print(request.headers['User-Agent'])
        return response  # must not be omitted!
2. Enable the custom downloader middleware in settings; it is registered the same way as pipelines
DOWNLOADER_MIDDLEWARES = {
    'Tencent.middlewares.UserAgentMiddleware': 543,  # 543 is the weight (priority) value
    'Tencent.middlewares.CheckUA': 600,  # the middleware with weight 543 runs first, then the one with 600
}
3. Add the User-Agent list in settings
USER_AGENTS_LIST = [
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
]
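To confirm that the rotation actually happens, you can point a throwaway spider at a header-echo service. A minimal sketch, assuming httpbin.org is reachable (the spider name and URL are illustrative, not part of the project above):

import scrapy

class UATestSpider(scrapy.Spider):
    name = 'ua_test'
    start_urls = ['https://httpbin.org/user-agent']

    def start_requests(self):
        # issue several requests so different User-Agents get picked
        for _ in range(5):
            yield scrapy.Request(self.start_urls[0], dont_filter=True, callback=self.parse)

    def parse(self, response):
        # httpbin echoes back the User-Agent header it received, as JSON
        print(response.text)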
II. Using proxy IPs
1. Add the code in middlewares.py
import base64
import random

from Tencent.settings import PROXY_LIST

class RandomProxy(object):

    def process_request(self, request, spider):

        proxy = random.choice(PROXY_LIST)
        print(proxy)

        if 'user_passwd' in proxy:
            # Base64-encode the credentials for basic auth; in Python 3 only bytes can be encoded
            b64_up = base64.b64encode(proxy['user_passwd'].encode())

            # Authenticate against the proxy
            request.headers['Proxy-Authorization'] = 'Basic ' + b64_up.decode()

            # Set the proxy
            request.meta['proxy'] = proxy['ip_port']
        else:
            # Set the proxy
            request.meta['proxy'] = proxy['ip_port']
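Like the User-Agent middleware, this one only takes effect once it is registered in DOWNLOADER_MIDDLEWARES. Assuming the same Tencent project layout as above (the weight 543 is an arbitrary choice):

DOWNLOADER_MIDDLEWARES = {
    'Tencent.middlewares.RandomProxy': 543,
}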
2. Checking whether a proxy IP works
When proxies are in use, you can track how each proxy IP performs in the downloader middleware's process_response() method; if a proxy IP is unusable, it can be replaced with another one.
class ProxyMiddleware(object):
    ......
    def process_response(self, request, response, spider):
        if response.status != 200:  # response.status is an int, not a string
            request.dont_filter = True  # lets the re-sent request back into the scheduler queue
            return request  # re-scheduling runs process_request() again, which picks a new random proxy
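A bad proxy often fails with a timeout or a refused connection rather than a non-200 status, and those failures never reach process_response(). A hedged sketch of catching them in process_exception() instead (this retry logic is an addition for illustration, not part of the original example):

from twisted.internet.error import ConnectionRefusedError, TimeoutError


class ProxyExceptionMiddleware(object):

    def process_exception(self, request, exception, spider):
        # connection-level failures surface here, not in process_response()
        if isinstance(exception, (TimeoutError, ConnectionRefusedError)):
            request.dont_filter = True  # allow the request back into the scheduler queue
            return request  # process_request() will assign a fresh random proxy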
3. Add the proxy IP list in settings
PROXY_LIST = [
    {"ip_port": "139.199.121.163:16818", "user_passwd": "user:password"},  # paid proxy
    # {"ip_port": "114.234.81.72:9000"}  # free proxy
    # note: newer Scrapy versions may require a scheme in the proxy URL, e.g. "http://ip:port"
]
III. Using selenium in a middleware
Using GitHub login as the example
1. Write the spider code
import scrapy

class Login4Spider(scrapy.Spider):
    name = 'login4'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/returnes']  # request the URL that requires login directly

    def parse(self, response):
        with open('check.html', 'w') as f:
            f.write(response.body.decode())
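Opening check.html in a browser is the quickest check, but you can also verify the login inside parse() itself. Logged-in GitHub pages have historically embedded the current user in a meta tag (this selector is an assumption about GitHub's markup and may break when the site changes):

    def parse(self, response):
        # logged-in pages carry the current user; None means the login did not stick
        user = response.xpath('//meta[@name="user-login"]/@content').get()
        print('Logged in as:', user)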
2. Use selenium in middlewares.py to obtain the cookies
import time
from selenium import webdriver


def getCookies():
    # Log in with selenium, then collect and return the cookies
    username = input('GitHub username:')
    password = input('GitHub password:')
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome('/home/worker/Desktop/driver/chromedriver',
                              chrome_options=options)  # Selenium 4 renamed this argument to options
    driver.get('https://github.com/login')
    time.sleep(1)
    # Selenium 4 removed find_element_by_xpath; use find_element(By.XPATH, ...) there
    driver.find_element_by_xpath('//*[@id="login_field"]').send_keys(username)
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="password"]').send_keys(password)
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="login"]/form/div[3]/input[3]').click()
    time.sleep(2)
    cookies_dict = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    driver.quit()
    return cookies_dict


class LoginDownloaderMiddleware(object):

    def process_request(self, request, spider):
        cookies_dict = getCookies()
        print(cookies_dict)
        request.cookies = cookies_dict  # replace the request object's cookies attribute
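As written, getCookies() launches a browser and prompts for credentials on every single request. A minimal caching sketch that logs in only once per process (the _cookies class attribute is an illustrative addition):

class LoginDownloaderMiddleware(object):

    _cookies = None  # cached after the first selenium login

    def process_request(self, request, spider):
        if LoginDownloaderMiddleware._cookies is None:
            LoginDownloaderMiddleware._cookies = getCookies()
        request.cookies = LoginDownloaderMiddleware._cookies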
3. Use selenium in middlewares.py to get the rendered HTML source of specific pages
import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SelMiddleWare(object):

    def process_request(self, request, spider):

        url = request.url
        # filter out the requests that need rendering
        if 'daydata' in url:

            driver = webdriver.Chrome()

            driver.get(url)
            time.sleep(3)

            data = driver.page_source
            driver.quit()  # quit() also shuts down the chromedriver process, unlike close()

            res = HtmlResponse(
                url=url,
                body=data,
                encoding='utf-8',
                request=request
            )

            # Returning a response here short-circuits the downloader:
            # Scrapy hands res straight back toward the spider
            return res
        # for all other requests, fall through (return None) so the normal download happens
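Starting a new Chrome instance for every matching request is expensive. A sketch that keeps one driver per middleware instance and shuts it down when the spider finishes (wiring this up through from_crawler and the spider_closed signal is standard Scrapy, but its use here is an assumption, not part of the original example):

from scrapy import signals
from selenium import webdriver


class SelMiddleWare(object):

    def __init__(self):
        self.driver = webdriver.Chrome()  # one browser for the whole crawl

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()  # release the browser when the spider closes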