Java爬虫技术之绕过百度云防护抓取网站内容

Wesley13
• 阅读 431

如图:

Java爬虫技术之绕过百度云防护抓取网站内容

首先需要一个Http工具类:HttpHandle

package org.coody.robot.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;


/**
 * Small HTTP client built on {@link HttpURLConnection}.
 *
 * <p>Each instance carries an {@link HttpConfig} holding extra request headers,
 * the cookie string, the body charset and the connect timeout. After every
 * response the cookies announced by the server are merged back into
 * {@code config}, so consecutive calls on one instance share cookies.
 */
public class HttpHandle {

    // Method names that can be passed to Conn(). HttpURLConnection.setRequestMethod
    // only accepts GET, POST, HEAD, OPTIONS, PUT, DELETE and TRACE; for any other
    // name it throws, Conn() prints the error and the connection's default method is used.
    public final static String POST="POST";
    public final static String GET="GET";
    public final static String HEAD="HEAD";
    public final static String PUT="PUT";
    public final static String CONNECT="CONNECT";
    public final static String OPTIONS="OPTIONS";
    public final static String DELETE="DELETE";
    public final static String PATCH="PATCH";
    public final static String PROPFIND="PROPFIND";
    public final static String PROPPATCH="PROPPATCH";
    public final static String MKCOL="MKCOL";
    public final static String COPY="COPY";
    public final static String MOVE="MOVE";
    public final static String LOCK="LOCK";
    public final static String UNLOCK ="UNLOCK";
    public final static String TRACE="TRACE";

    /** Connection profile selected by default. */
    public final static String HTTP_GENERAL="HTTP_GENERAL";

    /** Alternative connection profile; it currently sends the same default headers. */
    public final static String HTTP_JSON="HTTP_JSON";

    private static final String DEFAULT_ACCEPT =
            "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";

    private static final String DEFAULT_CONTENT_TYPE = "application/x-www-form-urlencoded";

    private static final String DEFAULT_USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";

    public HttpConfig config=new HttpConfig();

    public HttpConfig getConfig() {
        return config;
    }

    public void setConfig(HttpConfig config) {
        this.config = config;
    }

    /** Per-client request settings. */
    public static class HttpConfig{

        private boolean allowRedirects=true;

        private String cookie="";

        /** Charset name used to encode the request body and recorded on the response entity. */
        private String encode="UTF-8";

        /** Connect timeout in seconds. */
        private int timeOut=15;

        private String httpModule=HTTP_GENERAL;

        private Map<String, String> headerMap=new HashMap<String, String>();

        public void setEncode(String encode) {
            this.encode = encode;
        }

        public void setTimeOut(int timeOut) {
            this.timeOut = timeOut;
        }

        public void setCookie(String cookie) {
            this.cookie = cookie;
        }

        public void setHeaderMap(Map<String, String> headerMap) {
            this.headerMap = headerMap;
        }

        /** Sets (or overrides) a single request header. */
        public void setRequestProperty(String fieldName,String value){
            headerMap.put(fieldName, value);
        }

        /** Chooses the Accept-Encoding request header: gzip variants when enabled, "*" otherwise. */
        public void setGzip(boolean isGzip){
            if(isGzip){
                headerMap.put("Accept-Encoding", "gzip, deflate, sdch");
                return;
            }
            headerMap.put("Accept-Encoding", "*");
        }

        /** Chooses the Connection request header: keep-alive or close. */
        public void setKeepAlive(boolean keepAlive){
            if(keepAlive){
                headerMap.put("Connection", "keep-alive");
                return;
            }
            headerMap.put("Connection", "close");
        }

        /** Enables or disables automatic redirect following. */
        public void allowRedirects(boolean allowRedirects){
            this.allowRedirects=allowRedirects;
        }
    }

    /**
     * Opens a connection to {@code url} carrying the default Referer, Accept,
     * Content-type and User-Agent headers.
     *
     * @return the unconnected connection, or null when it cannot be created
     */
    private HttpURLConnection createConnection(String url) {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(url)
                    .openConnection();
            String referer = getDomain(url);
            if (referer != null) {
                conn.addRequestProperty("Referer", referer);
            }
            conn.addRequestProperty("Accept", DEFAULT_ACCEPT);
            conn.addRequestProperty("Content-type", DEFAULT_CONTENT_TYPE);
            conn.addRequestProperty("User-Agent", DEFAULT_USER_AGENT);
            return conn;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    private HttpURLConnection createConnectionGeneral(String url) {
        return createConnection(url);
    }

    private HttpURLConnection createConnectionJson(String url) {
        return createConnection(url);
    }

    /**
     * Derives the default Referer ({@code scheme://host[:port]/}) for a URL.
     * The port is omitted when it is absent or is the scheme's default
     * (80 for http, 443 for https).
     *
     * @param urlStr absolute URL
     * @return the origin with a trailing slash, or null when the URL cannot be
     *         parsed or has no host
     */
    public static String getDomain(String urlStr){
        try {
            URI uri=new URI(urlStr);
            String scheme=uri.getScheme();
            String host=uri.getHost();
            if(host==null){
                return null;
            }
            StringBuilder result=new StringBuilder();
            result.append(scheme).append("://").append(host);
            int port=uri.getPort();
            boolean defaultPort=(port==80&&"http".equalsIgnoreCase(scheme))
                    ||(port==443&&"https".equalsIgnoreCase(scheme));
            if(port>0&&!defaultPort){
                // A port is separated from the host by ':', not '/'.
                result.append(':').append(port);
            }
            result.append('/');
            return result.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Merges two ';'-separated cookie strings; pairs from {@code newCookie}
     * override pairs with the same name from {@code oldCookie}. Pairs with an
     * empty name or an empty value are skipped.
     *
     * @return {@code oldCookie} unchanged when {@code newCookie} is null,
     *         otherwise the merged string in {@code name=value;} form
     */
    private static String mergeCookie(String oldCookie, String newCookie) {
        if (newCookie == null) {
            return oldCookie;
        }
        Map<String, String> cookieMap = new HashMap<String, String>();
        String[] sources = { oldCookie, newCookie };
        for (String source : sources) {
            if (source == null || source.length() == 0) {
                continue;
            }
            for (String pair : source.split(";")) {
                // Split on the first '=' only, so values that contain or end
                // with '=' (e.g. base64 padding) are kept intact.
                int eq = pair.indexOf('=');
                if (eq < 1) {
                    continue;
                }
                String name = pair.substring(0, eq).trim();
                String value = pair.substring(eq + 1).trim();
                if (name.length() == 0 || value.length() == 0) {
                    continue;
                }
                cookieMap.put(name, value);
            }
        }
        StringBuilder merged = new StringBuilder();
        for (Map.Entry<String, String> entry : cookieMap.entrySet()) {
            merged.append(entry.getKey()).append("=").append(entry.getValue());
            merged.append(";");
        }
        return merged.toString();
    }

    /**
     * Appends the {@code name=value} pair of one Set-Cookie header to
     * {@code target}, followed by ';'. Attributes after the first ';'
     * (path, expires, domain, ...) are not part of the cookie and are dropped.
     */
    private static void appendCookiePair(StringBuilder target, String setCookieHeader) {
        if (setCookieHeader == null) {
            return;
        }
        int attributesStart = setCookieHeader.indexOf(';');
        String pair = attributesStart < 0
                ? setCookieHeader
                : setCookieHeader.substring(0, attributesStart);
        pair = pair.trim();
        if (pair.length() > 0) {
            target.append(pair).append(';');
        }
    }

    private HttpURLConnection getConnection(String url) {
        if(config.httpModule.equals(HTTP_GENERAL)){
            return createConnectionGeneral(url);
        }
        if(config.httpModule.equals(HTTP_JSON)){
            return createConnectionJson(url);
        }
        return null;
    }

    /** Performs a GET request; see {@link #Conn(String, String, String)}. */
    public HttpEntity Get(String url){
        return Conn(url, GET, null);
    }

    /** Performs a POST request with {@code data} as body; see {@link #Conn(String, String, String)}. */
    public HttpEntity Post(String url,String data){
        return Conn(url, POST, data);
    }

    /**
     * Sends one request and reads the whole response.
     *
     * @param url      target URL; spaces are replaced by {@code %20}
     * @param method   HTTP method name; a body is written for POST and PUT only
     * @param postData request body, encoded with the configured charset;
     *                 null is sent as an empty body
     * @return null when the connection cannot be created; otherwise an entity
     *         with status code, headers, cookies and body. When reading the
     *         response fails, the error is printed and the entity is returned
     *         with whatever was filled in before the failure.
     */
    public HttpEntity Conn(String url, String method,
            String postData){
        if(url.contains(" ")){
            url=url.replace(" ", "%20");
        }
        HttpURLConnection conn = getConnection(url);
        if (conn == null) {
            return null;
        }
        applyConfig(conn);
        sendRequest(conn, method, postData);
        return readResponse(conn);
    }

    /** Copies timeout, headers, redirect policy and cookies from {@code config} onto the connection. */
    private void applyConfig(HttpURLConnection conn) {
        // Must happen before the connection is opened, otherwise it has no effect.
        conn.setConnectTimeout(config.timeOut*1000);
        if (config.headerMap != null) {
            for (Map.Entry<String, String> header : config.headerMap.entrySet()) {
                conn.setRequestProperty(header.getKey(), header.getValue());
            }
        }
        if(!config.allowRedirects){
            conn.setInstanceFollowRedirects(false);
        }
        if (config.cookie != null && config.cookie.length() > 0) {
            conn.setRequestProperty("Cookie", config.cookie);
        }
    }

    /** Sets the request method and, for POST/PUT, writes the body. Failures are printed, not thrown. */
    private void sendRequest(HttpURLConnection conn, String method, String postData) {
        try {
            conn.setRequestMethod(method);
            if (POST.equalsIgnoreCase(method)||PUT.equalsIgnoreCase(method)) {
                conn.setDoOutput(true);
                byte[] body = postData == null
                        ? new byte[0]
                        : postData.getBytes(config.encode);
                conn.setRequestProperty("Content-Length", String.valueOf(body.length));
                java.io.OutputStream out = conn.getOutputStream();
                try {
                    out.write(body);
                    out.flush();
                } finally {
                    out.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Reads status, headers, cookies and body into a new entity and updates the stored cookies. */
    private HttpEntity readResponse(HttpURLConnection conn) {
        HttpEntity hEntity = new HttpEntity();
        InputStream ins = null;
        try {
            int status = conn.getResponseCode();
            // getErrorStream() only carries a body for error responses; other
            // non-200 statuses (201, 3xx, ...) deliver theirs via getInputStream().
            if (status >= HttpURLConnection.HTTP_BAD_REQUEST) {
                ins = conn.getErrorStream();
            } else {
                ins = conn.getInputStream();
            }
            hEntity.setCode(status);
            Map<String,String> headMap=new HashMap<String, String>();
            StringBuilder cookie = new StringBuilder();
            String key;
            // Index 0 is skipped: implementations may report the status line
            // there with a null key, which would end the loop immediately.
            for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
                headMap.put(key, conn.getHeaderField(key));
                if (key.equalsIgnoreCase("set-cookie")) {
                    appendCookiePair(cookie, conn.getHeaderField(i));
                }
            }
            config.cookie = mergeCookie(config.cookie, cookie.toString());
            byte[] b = toByte(ins);
            if (isGzipBody(conn, b)) {
                b = GZIPUtils.uncompress(b);
            }
            hEntity.setEncode(config.encode);
            hEntity.setBye(b);
            hEntity.setCookie(config.cookie);
            hEntity.setHeadMap(headMap);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (ins != null) {
                try {
                    ins.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
        return hEntity;
    }

    /**
     * Decides whether the body must be gunzipped: gzip has to be announced by
     * the response (Content-Encoding) or requested (Accept-Encoding), and the
     * body must actually start with the gzip magic bytes. The magic check keeps
     * plain bodies from being wiped when the server ignored Accept-Encoding.
     */
    private static boolean isGzipBody(HttpURLConnection conn, byte[] body) {
        if (body == null || body.length < 2) {
            return false;
        }
        boolean gzipMagic = (body[0] & 0xff) == 0x1f && (body[1] & 0xff) == 0x8b;
        if (!gzipMagic) {
            return false;
        }
        String contentEncoding = conn.getContentEncoding();
        if (contentEncoding != null
                && contentEncoding.toLowerCase(java.util.Locale.ENGLISH).contains("gzip")) {
            return true;
        }
        String acceptEncoding = conn.getRequestProperty("Accept-Encoding");
        return acceptEncoding != null && acceptEncoding.contains("gzip");
    }

    /**
     * Reads the stream to its end.
     *
     * @return all bytes, or null when {@code ins} is null or reading fails
     */
    private byte[] toByte(InputStream ins) {
        if(ins==null){
            return null;
        }
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try {
            byte[] chunk = new byte[1024];
            int read;
            while ((read = ins.read(chunk, 0, chunk.length)) != -1) {
                buffer.write(chunk, 0, read);
            }
            return buffer.toByteArray();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

}

其次需要一个Http响应对象类:HttpEntity

package org.coody.robot.util;

import java.util.HashMap;
import java.util.Map;

/**
 * Holds one HTTP response: status code, headers, cookie string and raw body,
 * with helpers that decode the body to text and parse the cookies.
 */
public class HttpEntity {

    /** Decoded body, cached by {@link #getHtml()} or set explicitly. */
    private String html;
    /** Raw response body. */
    private byte[] bye;
    /** Cookie string in {@code name=value;name=value} form. */
    private String cookie;
    /** HTTP status code; -1 until one has been set. */
    private Integer code=-1;
    private Map<String,String> headMap;
    /** Charset name used to decode the body. */
    private String encode="UTF-8";

    public Map<String, String> getHeadMap() {
        return headMap;
    }

    public void setHeadMap(Map<String, String> headMap) {
        this.headMap = headMap;
    }

    /**
     * Returns the body decoded with the configured charset. The decoded text
     * is cached, so later calls return the same string.
     *
     * @return the text, or null when there is no body or decoding fails
     */
    public String getHtml() {
        if (html != null) {
            return html;
        }
        html = decode(bye);
        return html;
    }

    /**
     * Returns the body as text, optionally gunzipping it first. The result is
     * not cached.
     *
     * @param isGzip true to decompress the raw body before decoding,
     *               false to decode the raw body as it is
     * @return the text, or null when there is no body or decoding fails
     */
    public String getHtml(boolean isGzip) {
        if (bye == null) {
            return null;
        }
        byte[] data = isGzip ? GZIPUtils.uncompress(bye) : bye;
        return decode(data);
    }

    /** Decodes {@code data} with the configured charset; null on missing data or failure. */
    private String decode(byte[] data) {
        if (data == null) {
            return null;
        }
        try {
            return new String(data, encode);
        } catch (Exception e) {
            // Unknown or null charset name: report it and signal "no text".
            e.printStackTrace();
            return null;
        }
    }

    public String getEncode() {
        return encode;
    }

    public void setEncode(String encode) {
        this.encode = encode;
    }

    public void setHtml(String html) {
        this.html = html;
    }

    public Integer getCode() {
        return code;
    }

    public void setCode(Integer code) {
        this.code = code;
    }

    public String getCookie() {
        return cookie;
    }

    public void setCookie(String cookie) {
        this.cookie = cookie;
    }

    public byte[] getBye() {
        return bye;
    }

    public void setBye(byte[] bye) {
        this.bye = bye;
    }

    /**
     * Parses the cookie string into a name-to-value map. Names and values are
     * trimmed; pairs without '=' or with an empty name or value are skipped.
     *
     * @return the parsed cookies, or null when no cookie string is set
     */
    public Map<String, String> getCookieMap() {
        if (cookie == null) {
            return null;
        }
        Map<String, String> cookieMap = new HashMap<String, String>();
        for (String pair : cookie.split(";")) {
            // Split on the first '=' only so that values containing or ending
            // with '=' (e.g. base64 padding) are preserved.
            int eq = pair.indexOf('=');
            if (eq < 1) {
                continue;
            }
            String name = pair.substring(0, eq).trim();
            String value = pair.substring(eq + 1).trim();
            if (name.length() == 0 || value.length() == 0) {
                continue;
            }
            cookieMap.put(name, value);
        }
        return cookieMap;
    }
}

某些网站启用了Gzip压缩,因此还需要一个Gzip压缩/解压工具类:GZIPUtils

package org.coody.robot.util;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;  
  
  
  
/**
 * GZIP compression and decompression helpers working on in-memory byte arrays.
 *
 * @author wenqi5
 */
public class GZIPUtils {

    public static final String GZIP_ENCODE_UTF_8 = "UTF-8";

    /**
     * Compresses the UTF-8 bytes of a string.
     *
     * @param str text to compress
     * @return the GZIP data, or null when {@code str} is null or empty
     */
    public static byte[] compress(String str) {
        if (str == null) {
            return null;
        }
        try {
            return compress(str.getBytes(GZIP_ENCODE_UTF_8));
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Compresses a byte array.
     *
     * @param data bytes to compress
     * @return the GZIP data, or null when {@code data} is null or empty or
     *         compression fails
     */
    public static byte[] compress(byte[] data) {
        if (data == null || data.length == 0) {
            return null;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPOutputStream gzip = null;
        try {
            gzip = new GZIPOutputStream(out);
            gzip.write(data);
            // Writes the trailer so the buffer holds a complete GZIP stream.
            gzip.finish();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (gzip != null) {
                try {
                    gzip.close();
                } catch (IOException ignored) {
                    // closing an in-memory stream; nothing to recover
                }
            }
        }
        return out.toByteArray();
    }

    /**
     * Decompresses GZIP data.
     *
     * <p>Input that does not start with the GZIP magic bytes is returned
     * unchanged (the same array instance), so callers that are unsure whether
     * a payload was compressed do not lose it. When the stream is corrupt or
     * truncated, the error is printed and the bytes decoded so far are returned.
     *
     * @param bytes possibly compressed data
     * @return the decompressed bytes, or null when {@code bytes} is null or empty
     */
    public static byte[] uncompress(byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return null;
        }
        if (!hasGzipHeader(bytes)) {
            return bytes;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPInputStream ungzip = null;
        try {
            ungzip = new GZIPInputStream(new ByteArrayInputStream(bytes));
            byte[] buffer = new byte[4096];
            int n;
            while ((n = ungzip.read(buffer)) >= 0) {
                out.write(buffer, 0, n);
            }
        } catch (IOException e) {
            // Best effort: keep what was decoded before the failure.
            e.printStackTrace();
        } finally {
            if (ungzip != null) {
                try {
                    ungzip.close();
                } catch (IOException ignored) {
                    // closing an in-memory stream; nothing to recover
                }
            }
        }
        return out.toByteArray();
    }

    /** Tells whether the data starts with the two GZIP magic bytes 0x1f 0x8b. */
    private static boolean hasGzipHeader(byte[] bytes) {
        return bytes.length >= 2
                && (bytes[0] & 0xff) == 0x1f
                && (bytes[1] & 0xff) == 0x8b;
    }
}

以上类均依赖一个StringUtil,笔者比较懒,也没有拆分出来

package org.coody.robot.util;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//import oracle.sql.CLOB;

public class StringUtil {

    
    /**
     * Converts each element's {@code toString()} text to an Integer.
     *
     * @param objs values to convert
     * @return an array of the same length whose slots are null wherever the
     *         element is null or not an integer; null when {@code objs} is
     *         considered empty by {@code isNullOrEmpty}
     */
    public static Integer[] getIntegerParas(Object[] objs) {
        if (isNullOrEmpty(objs)) {
            return null;
        }
        final int count = objs.length;
        Integer[] parsed = new Integer[count];
        int idx = 0;
        while (idx < count) {
            try {
                parsed[idx] = Integer.valueOf(objs[idx].toString());
            } catch (Exception ignored) {
                // null or non-numeric element: the slot stays null
            }
            idx++;
        }
        return parsed;
    }

    /**
     * Collects {@code size} copies of {@code baseStr} and joins them with
     * {@code mosaicChr} via {@code collectionMosaic(List, String)}.
     *
     * @param baseStr   text to repeat; nothing is collected when it is null or empty
     * @param mosaicChr separator placed between the copies
     * @param size      number of copies
     * @return the result of {@code collectionMosaic} for the collected copies
     */
    public static String getByMosaicChr(String baseStr, String mosaicChr, Integer size) {
        List<String> copies = new ArrayList<String>();
        for (int round = 0; round < size; round++) {
            if (!isNullOrEmpty(baseStr)) {
                copies.add(baseStr);
            }
        }
        return collectionMosaic(copies, mosaicChr);
    }

    /**
     * Splits a string on a literal (non-regex) separator, keeping empty pieces.
     *
     * <p>For example {@code "a,b,,c"} split on {@code ","} yields
     * {@code ["a", "b", "", "c"]}, and an empty source yields one empty piece.
     *
     * @param src
     *            source string
     * @param separator
     *            literal separator; may be longer than one character
     * @return the pieces in order of appearance
     */
    public static String[] splitToStringArray(String src, String separator) {
        List<String> pieces = new ArrayList<String>();
        // Skip the whole separator after each hit. An empty separator matches
        // at every index, so the step is at least 1 to keep the loop finite.
        int step = Math.max(1, separator.length());
        int from = 0;
        while (from <= src.length()) {
            int hit = src.indexOf(separator, from);
            if (hit < 0) {
                hit = src.length();
            }
            pieces.add(src.substring(from, hit));
            from = hit + step;
        }
        return pieces.toArray(new String[pieces.size()]);
    }

    /**
     * Splits a string with {@code splitToStringArray} and converts every piece
     * to an Integer.
     *
     * @param src
     *            source string
     * @param separator
     *            separator passed to {@code splitToStringArray}
     * @return the converted values; a piece that is not an integer makes
     *         {@code Integer.valueOf} throw NumberFormatException
     */
    public static Integer[] splitToIntgArray(String src, String separator) {
        String[] tokens = splitToStringArray(src, separator);
        Integer[] numbers = new Integer[tokens.length];
        int pos = 0;
        for (String token : tokens) {
            numbers[pos] = Integer.valueOf(token);
            pos++;
        }
        return numbers;
    }

    /**
     * Splits a string with {@code splitToStringArray} and parses every piece
     * as a primitive int.
     *
     * @param src
     *            source string
     * @param separator
     *            separator passed to {@code splitToStringArray}
     * @return the parsed values; a piece that is not an integer makes
     *         {@code Integer.parseInt} throw NumberFormatException
     */
    public static int[] splitToIntArray(String src, String separator) {
        String[] tokens = splitToStringArray(src, separator);
        int[] numbers = new int[tokens.length];
        int pos = 0;
        for (String token : tokens) {
            numbers[pos] = Integer.parseInt(token);
            pos++;
        }
        return numbers;
    }

    /**
     * Builds {@code size} "?" placeholders joined by "," through
     * {@code getByMosaicChr}.
     *
     * @param size number of placeholders
     * @return the value returned by {@code getByMosaicChr("?", ",", size)}
     */
    public static String getInPara(Integer size) {
        final String placeholder = "?";
        final String separator = ",";
        return getByMosaicChr(placeholder, separator, size);
    }

    /**
     * Extracts the text between the first occurrence of {@code firstTxt} and
     * the next occurrence of {@code lastTxt} after it.
     *
     * @param allTxt   text to search
     * @param firstTxt opening marker
     * @param lastTxt  closing marker
     * @return the enclosed text, or "" when either marker is missing or any
     *         exception occurs (for example a null argument)
     */
    public static String textCutCenter(String allTxt, String firstTxt, String lastTxt) {
        try {
            int open = allTxt.indexOf(firstTxt);
            if (open < 0) {
                return "";
            }
            int contentStart = open + firstTxt.length();
            int close = allTxt.indexOf(lastTxt, contentStart);
            return close < 0 ? "" : allTxt.substring(contentStart, close);
        } catch (Exception e) {
            return "";
        }
    }

    /**
     * Extracts every text span enclosed by {@code firstTxt} ... {@code lastTxt},
     * scanning left to right; after a span the search resumes behind its
     * closing marker.
     *
     * @param allTxt   text to search
     * @param firstTxt opening marker
     * @param lastTxt  closing marker
     * @return the enclosed texts in order of appearance (empty when nothing
     *         matches or a marker is empty); null when an argument is null
     */
    public static List<String> textCutCenters(String allTxt, String firstTxt, String lastTxt) {
        if (allTxt == null || firstTxt == null || lastTxt == null) {
            return null;
        }
        List<String> results = new ArrayList<String>();
        // An empty marker matches everywhere and would never let the scan advance.
        if (firstTxt.length() == 0 || lastTxt.length() == 0) {
            return results;
        }
        int cursor = 0;
        while (true) {
            int open = allTxt.indexOf(firstTxt, cursor);
            if (open < 0) {
                break;
            }
            int contentStart = open + firstTxt.length();
            int close = allTxt.indexOf(lastTxt, contentStart);
            if (close < 0) {
                break;
            }
            results.add(allTxt.substring(contentStart, close));
            // Resume after the closing marker (its own length, not the opener's).
            cursor = close + lastTxt.length();
        }
        return results;
    }
    /**
     * Renders every Unicode code point of the string as {@code &#N}, where N
     * is the decimal code point (no trailing ';' is emitted). A surrogate pair
     * produces a single entry for the supplementary character.
     *
     * @param source text to convert
     * @return the concatenated {@code &#N} sequences; "" for an empty string
     */
    public static String convertToUnicode(String source) {
        StringBuilder out = new StringBuilder();
        int i = 0;
        while (i < source.length()) {
            int codePoint = source.codePointAt(i);
            out.append("&#").append(codePoint);
            // Step over both chars of a surrogate pair at once.
            i += Character.charCount(codePoint);
        }
        return out.toString();
    }
    /**
     * Converts an object's {@code toString()} text to an Integer.
     *
     * @param obj value to convert
     * @return the Integer, or null when {@code obj} is null/empty per
     *         {@code isNullOrEmpty} or cannot be parsed
     */
    public static Integer toInteger(Object obj) {
        Integer parsed = null;
        if (!isNullOrEmpty(obj)) {
            try {
                parsed = Integer.valueOf(obj.toString());
            } catch (Exception e) {
                parsed = null;
            }
        }
        return parsed;
    }

    /**
     * Returns an object's {@code toString()} text.
     *
     * @param obj value to convert
     * @return the text, or null when {@code obj} is null/empty per
     *         {@code isNullOrEmpty} or {@code toString()} throws
     */
    public static String toString(Object obj) {
        String text = null;
        if (!isNullOrEmpty(obj)) {
            try {
                text = String.valueOf(obj.toString());
            } catch (Exception e) {
                text = null;
            }
        }
        return text;
    }

    /**
     * Converts an object's {@code toString()} text to a Double.
     *
     * @param obj value to convert
     * @return the Double, or null when {@code obj} is null/empty per
     *         {@code isNullOrEmpty} or cannot be parsed
     */
    public static Double toDouble(Object obj) {
        Double parsed = null;
        if (!isNullOrEmpty(obj)) {
            try {
                parsed = Double.valueOf(obj.toString());
            } catch (Exception e) {
                parsed = null;
            }
        }
        return parsed;
    }

    /**
     * Converts an object's {@code toString()} text to a Float.
     *
     * @param obj value to convert
     * @return the Float, or null when {@code obj} is null/empty per
     *         {@code isNullOrEmpty} or cannot be parsed
     */
    public static Float toFloat(Object obj) {
        Float parsed = null;
        if (!isNullOrEmpty(obj)) {
            try {
                parsed = Float.valueOf(obj.toString());
            } catch (Exception e) {
                parsed = null;
            }
        }
        return parsed;
    }

    /**
     * Converts an object's {@code toString()} text to a Long.
     *
     * @param obj value to convert
     * @return the Long, or null when {@code obj} is null/empty per
     *         {@code isNullOrEmpty} or cannot be parsed
     */
    public static Long toLong(Object obj) {
        Long parsed = null;
        if (!isNullOrEmpty(obj)) {
            try {
                parsed = Long.valueOf(obj.toString());
            } catch (Exception e) {
                parsed = null;
            }
        }
        return parsed;
    }

    /**
     * Picks a pseudo-random integer between {@code start} and {@code end},
     * both inclusive, using {@code Math.random()}.
     *
     * @param start lower bound (inclusive)
     * @param end   upper bound (inclusive)
     * @return a value in {@code [start, end]}
     */
    public static Integer getRanDom(int start, int end) {
        int span = end - start + 1;
        double scaled = Math.random() * span;
        return (int) scaled + start;
    }

    /**
     * Picks a pseudo-random float between {@code start} and {@code end}.
     *
     * <p>Both bounds are scaled by a power of ten taken from the longer
     * fractional part of their text forms, truncated to ints, passed to
     * {@code getRanDom(int, int)}, and the result is scaled back.
     *
     * @param start lower bound
     * @param end   upper bound
     * @return the scaled-back random value
     */
    public static float getRanDom(Float start, Float end) {
        // Digits after the decimal point in each bound's text form (1 when
        // the text does not split into exactly two parts).
        String[] startParts = String.valueOf(start).split("\\.");
        int startDigits = startParts.length == 2 ? startParts[1].length() : 1;
        String[] endParts = String.valueOf(end).split("\\.");
        int endDigits = endParts.length == 2 ? endParts[1].length() : 1;
        int digits = Math.max(startDigits, endDigits);

        Float scaledStart = (float) (start * Math.pow(10, digits));
        Float scaledEnd = (float) (end * Math.pow(10, digits));
        Integer picked = getRanDom(scaledStart.intValue(), scaledEnd.intValue());
        return (float) (picked / Math.pow(10, digits));
    }

    /**
     * Removes every whitespace character (spaces, tabs, line breaks) from a
     * string.
     *
     * @param str text to clean
     * @return the text without whitespace; "" when {@code str} is null
     */
    public static String replaceBlank(String str) {
        if (str == null) {
            return "";
        }
        return str.replaceAll("\\s*|\t|\r|\n", "");
    }

    /**
     * Tests whether the whole value matches a regular expression.
     *
     * @param val     text to test
     * @param matcher regular expression
     * @return true when the entire {@code val} matches {@code matcher}
     */
    public static Boolean isMatcher(String val, String matcher) {
        return Pattern.matches(matcher, val);
    }

    /**
     * Checks whether a string is an 11-digit mobile number with one of the
     * prefixes this method recognises: 13x, 15x except 154, 17x except 174,
     * and 180 or 185-189.
     *
     * @param mobile text to check
     * @return true when the text matches; false for null, empty or anything else
     */
    public static boolean isMobile(String mobile) {
        if (mobile == null || mobile.length() == 0) {
            return false;
        }
        // Character classes list digits only; a comma inside [...] is a
        // literal and would let "18,..." through.
        Pattern p = Pattern.compile("^((13[0-9])|(15[0-35-9])|(17[0-35-9])|(18[05-9]))\\d{8}$");
        Matcher m = p.matcher(mobile);
        return m.matches();
    }

    /**
     * Checks whether a string consists of 3 to 16 letters, digits or
     * underscores.
     *
     * @param str text to check
     * @return true when the whole text matches; false for null or empty input
     */
    public static boolean isLegal(String str) {
        if (isNullOrEmpty(str)) {
            return false;
        }
        return Pattern.compile("[A-Za-z0-9_]{3,16}").matcher(str).matches();
    }

    /**
     * Checks whether a string looks like an e-mail address: a local part of
     * letters, digits, '_', '-' or '.', an '@', and either a dotted domain
     * ending in an alphabetic top-level label or a bracketed dotted-number
     * host.
     *
     * @param email text to check
     * @return true when the whole text matches; false for null or empty input
     */
    public static boolean isEmail(String email) {
        if (email == null || email.length() == 0) {
            return false;
        }
        // The top-level label may be up to 63 letters (the DNS label limit);
        // a 4-letter cap rejects real domains such as ".museum".
        Pattern p = Pattern.compile(
                "^([a-zA-Z0-9_\\-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([a-zA-Z0-9\\-]+\\.)+))([a-zA-Z]{2,63}|[0-9]{1,3})(\\]?)$");
        Matcher m = p.matcher(email);
        return m.matches();
    }

    /**
     * Checks whether a string consists of 16 to 40 letters, digits or
     * underscores.
     * NOTE(review): the pattern is not restricted to hexadecimal digits, so it
     * accepts more than MD5 hex strings — confirm whether that is intended.
     *
     * @param md5 text to check
     * @return true when the whole text matches; false for null or empty input
     */
    public static boolean isMd5(String md5) {
        if (isNullOrEmpty(md5)) {
            return false;
        }
        return Pattern.compile("[A-Za-z0-9_]{16,40}").matcher(md5).matches();
    }


    /**
     * Tells whether every argument is null or empty per {@code isNullOrEmpty}.
     *
     * @param obj values to inspect
     * @return true when no values are given or all of them are null/empty
     */
    public static boolean isAllNull(Object... obj) {
        if (obj == null || obj.length == 0) {
            return true;
        }
        for (Object candidate : obj) {
            if (!isNullOrEmpty(candidate)) {
                return false;
            }
        }
        return true;
    }

    /**
     * List variant of {@code isAllNull(Object...)}: checks the list's elements.
     *
     * @param objs list to inspect; must not be null
     * @return true when the list has no element that is non-null and non-empty
     */
    public static boolean isAllNull(List<Object> objs) {
        Object[] elements = objs.toArray();
        return isAllNull(elements);
    }

    /**
     * Joins the elements of an array with a separator by delegating to
     * {@code collectionMosaic(List, String)}.
     *
     * @param objs      elements to join
     * @param mosaicChr separator
     * @return the joined text; null when the array is null/empty per
     *         {@code isNullOrEmpty}
     */
    public static String collectionMosaic(Object[] objs, String mosaicChr) {
        return isNullOrEmpty(objs)
                ? null
                : collectionMosaic(Arrays.asList(objs), mosaicChr);
    }

    /**
     * Joins the values of an int array with a separator by converting them to
     * text and delegating to {@code collectionMosaic(Object[], String)}.
     *
     * @param intObjs   values to join; must not be null
     * @param mosaicChr separator
     * @return the joined text as returned by the delegate
     */
    public static String collectionMosaic(int[] intObjs, String mosaicChr) {
        Object[] texts = new Object[intObjs.length];
        int pos = 0;
        for (int value : intObjs) {
            texts[pos] = String.valueOf(value);
            pos++;
        }
        return collectionMosaic(texts, mosaicChr);
    }

    /**
     * Joins one or more values with a separator by delegating to
     * {@code collectionMosaic(List, String)}.
     *
     * @param mosaicChr separator
     * @param objs      values to join
     * @return the joined text as returned by the delegate
     */
    public static String collectionMosaic(String mosaicChr, Object... objs) {
        return collectionMosaic(Arrays.asList(objs), mosaicChr);
    }

    /**
     * Joins the elements of a list with a separator, skipping elements that
     * are null or empty per {@code isNullOrEmpty}. The separator appears only
     * between two emitted elements, never at the start or the end.
     *
     * @param objs      elements to join
     * @param mosaicChr separator
     * @return the joined text ("" when every element was skipped); null when
     *         the list is null or empty
     */
    public static String collectionMosaic(List<?> objs, String mosaicChr) {
        if (objs == null || objs.isEmpty()) {
            return null;
        }
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        for (Object obj : objs) {
            if (isNullOrEmpty(obj)) {
                continue;
            }
            // Separate from the previous emitted element; deciding here rather
            // than after appending avoids a trailing separator when later
            // elements are skipped.
            if (!first) {
                joined.append(mosaicChr);
            }
            joined.append(obj);
            first = false;
        }
        return joined.toString();
    }

    /**
     * Collects {@code size} copies of {@code baseStr} and joins them with
     * {@code mosaicChr} via {@code collectionMosaic(List, String)}.
     *
     * @param baseStr   text to repeat; nothing is collected when it is null or empty
     * @param mosaicChr separator placed between the copies
     * @param size      number of copies
     * @return the result of {@code collectionMosaic} for the collected copies
     */
    public static String getStringSByMosaicChr(String baseStr, String mosaicChr, Integer size) {
        List<String> repeated = new ArrayList<String>();
        int round = 0;
        while (round < size) {
            if (!isNullOrEmpty(baseStr)) {
                repeated.add(baseStr);
            }
            round++;
        }
        return collectionMosaic(repeated, mosaicChr);
    }

    /**
     * Splits a text with {@code String.split} and returns the non-empty pieces.
     *
     * @param text
     *            text to split
     * @param mosaiChr
     *            separator; it is handed to {@code String.split}, so it is
     *            interpreted as a regular expression
     * @return the non-empty pieces in order; null when either argument is null
     */
    public static List<String> splitByMosaic(String text, String mosaiChr) {
        if (text == null || mosaiChr == null) {
            return null;
        }
        List<String> pieces = new ArrayList<String>();
        for (String piece : text.split(mosaiChr)) {
            if (!isNullOrEmpty(piece)) {
                pieces.add(piece);
            }
        }
        return pieces;
    }

    /**
     * Splits a text with {@code String.split} and returns the pieces that
     * parse as integers; empty and unparsable pieces are dropped.
     *
     * @param text
     *            text to split
     * @param mosaiChr
     *            separator; it is handed to {@code String.split}, so it is
     *            interpreted as a regular expression
     * @return the parsed values in order; null when either argument is null
     */
    public static List<Integer> splitByMosaicInteger(String text, String mosaiChr) {
        if (text == null || mosaiChr == null) {
            return null;
        }
        List<Integer> numbers = new ArrayList<Integer>();
        for (String piece : text.split(mosaiChr)) {
            if (isNullOrEmpty(piece)) {
                continue;
            }
            try {
                numbers.add(Integer.valueOf(piece));
            } catch (Exception ignored) {
                // not an integer: the piece is left out
            }
        }
        return numbers;
    }

    /**
     * Splits a text with {@code String.split} and converts the pieces to an
     * Integer array of the same length; slots of empty or unparsable pieces
     * stay null.
     *
     * @param text
     *            text to split
     * @param mosaiChr
     *            separator; it is handed to {@code String.split}, so it is
     *            interpreted as a regular expression
     * @return one slot per piece; null when either argument is null
     */
    public static Integer[] splitByMosaicIntegers(String text, String mosaiChr) {
        if (text == null || mosaiChr == null) {
            return null;
        }
        String[] pieces = text.split(mosaiChr);
        Integer[] numbers = new Integer[pieces.length];
        int pos = -1;
        for (String piece : pieces) {
            pos++;
            if (isNullOrEmpty(piece)) {
                continue;
            }
            try {
                numbers[pos] = Integer.valueOf(piece);
            } catch (Exception ignored) {
                // not an integer: the slot stays null
            }
        }
        return numbers;
    }

    /**
     * Finds every non-empty match of a regular expression in a text. The
     * pattern is compiled with {@code Pattern.DOTALL}.
     *
     * @param context text to search
     * @param pat     regular expression
     * @return the matched substrings in order of appearance (zero-length
     *         matches are left out); null when the pattern is invalid or an
     *         argument is null
     */
    public static List<String> doMatcher(String context, String pat) {
        try {
            List<String> matches = new ArrayList<String>();
            Matcher matcher = Pattern.compile(pat, Pattern.DOTALL).matcher(context);
            // find() without an index always moves past a zero-length match,
            // so patterns that can match the empty string still terminate.
            while (matcher.find()) {
                String hit = matcher.group(0);
                if (hit == null || hit.length() == 0) {
                    continue;
                }
                matches.add(hit);
            }
            return matches;
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Returns the first match that {@code doMatcher} finds.
     *
     * @param context text to search
     * @param pat     regular expression
     * @return the first match, or null when {@code doMatcher} returns null or
     *         an empty list
     */
    public static String doMatcherFirst(String context, String pat) {
        List<String> found = doMatcher(context, pat);
        return StringUtil.isNullOrEmpty(found) ? null : found.get(0);
    }

    /**
     * Tells whether a value is null or "empty".
     *
     * <p>Empty means: a CharSequence of length 0, an empty Collection or Map,
     * or an Object[] that has no elements or whose elements are all
     * null/empty themselves. Any other object (including primitive arrays)
     * is not empty.
     *
     * @param obj value to inspect
     * @return true when the value is null or empty, or when inspecting it
     *         throws an exception
     */
    public static boolean isNullOrEmpty(Object obj) {
        try {
            boolean empty;
            if (obj == null) {
                empty = true;
            } else if (obj instanceof CharSequence) {
                empty = ((CharSequence) obj).length() == 0;
            } else if (obj instanceof Collection) {
                empty = ((Collection<?>) obj).isEmpty();
            } else if (obj instanceof Map) {
                empty = ((Map<?, ?>) obj).isEmpty();
            } else if (obj instanceof Object[]) {
                // An array is empty unless at least one element is non-empty.
                empty = true;
                for (Object element : (Object[]) obj) {
                    if (!isNullOrEmpty(element)) {
                        empty = false;
                        break;
                    }
                }
            } else {
                empty = false;
            }
            return empty;
        } catch (Exception e) {
            return true;
        }

    }

    /**
     * Finds the position of the first argument that is null or empty per
     * {@code isNullOrEmpty}.
     *
     * @param objs values to inspect
     * @return the index of the first null/empty value; 0 when the argument
     *         array itself is null/empty per {@code isNullOrEmpty}; -1 when
     *         every value is present
     */
    public static Integer findNull(Object... objs) {
        if (isNullOrEmpty(objs)) {
            return 0;
        }
        int position = 0;
        for (Object candidate : objs) {
            if (isNullOrEmpty(candidate)) {
                return position;
            }
            position++;
        }
        return -1;
    }

    /**
     * Tells whether {@code findNull} locates a null or empty value among the
     * arguments.
     *
     * @param objs values to inspect
     * @return true when {@code findNull} returns an index of 0 or more
     */
    public static boolean hasNull(Object... objs) {
        int firstMissing = findNull(objs);
        return firstMissing >= 0;
    }
    /**
     * Checks whether a string parses as an Integer via {@code Integer.valueOf}.
     *
     * @param str text to check
     * @return true when parsing succeeds; false for null, empty or unparsable text
     */
    public static Boolean isNumber(String str) {
        boolean numeric = false;
        if (!isNullOrEmpty(str)) {
            try {
                Integer.valueOf(str);
                numeric = true;
            } catch (Exception e) {
                numeric = false;
            }
        }
        return numeric;
    }

    /**
     * Concatenates all array elements in order, without a separator.
     *
     * @param args the pieces to join; a {@code null} element contributes the text "null"
     * @return the concatenation, or an empty string for an empty array
     */
    public static String argsToString(String[] args) {
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < args.length; i++) {
            joined.append(args[i]);
        }
        return joined.toString();
    }

    /**
     * Splits a string into an array of one-character strings, one per {@code char}.
     *
     * @param str the text to split, may be {@code null}
     * @return one single-character string per position, or {@code null} when the input
     *         is null or empty
     */
    public static String[] splitString(String str) {
        if (isNullOrEmpty(str)) {
            return null;
        }
        int length = str.length();
        String[] pieces = new String[length];
        for (int i = 0; i < length; i++) {
            pieces[i] = String.valueOf(str.charAt(i));
        }
        return pieces;
    }

    /**
     * Concatenates the string form of every argument.
     *
     * <p>A null-or-empty element (per {@code isNullOrEmpty}) contributes the literal
     * text {@code "null"} exactly once. Previously the placeholder was appended and
     * then the element's own string form was appended as well, so a {@code null}
     * element produced {@code "nullnull"} and an empty list produced {@code "null[]"}.
     *
     * @param objs the values to concatenate
     * @return the concatenated text, or an empty string when the argument array as a
     *         whole is null or empty
     */
    public static String getString(Object... objs) {
        if (isNullOrEmpty(objs)) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (Object obj : objs) {
            if (isNullOrEmpty(obj)) {
                // Placeholder only; do not also append the element itself.
                sb.append("null");
            } else {
                sb.append(String.valueOf(obj));
            }
        }
        return sb.toString();
    }

    /**
     * Returns the characters of the string sorted in ascending {@code char} order.
     *
     * @param str the text to sort, may be {@code null}
     * @return the sorted text, or an empty string when the input is null or empty
     */
    public static String stringSort(String str) {
        if (isNullOrEmpty(str)) {
            return "";
        }
        // Sorting the chars directly gives the same order as sorting one-char strings.
        char[] units = str.toCharArray();
        Arrays.sort(units);
        return new String(units);
    }

    /**
     * Collection collision: returns the required elements that are missing from the
     * actual list.
     *
     * <p>Null arguments are tolerated, matching {@code listConflict}: a null or empty
     * {@code needList} yields {@code null}, and a null {@code actualList} is treated as
     * an empty list, so every required element is reported missing.
     *
     * @param needList
     *            the required elements
     * @param actualList
     *            the elements actually present
     * @return a new list of the elements of {@code needList} not contained in
     *         {@code actualList}, in their original order, or {@code null} when
     *         nothing is missing
     */
    public static List<?> collisionList(List<?> needList, List<?> actualList) {
        if (needList == null || needList.isEmpty()) {
            return null;
        }
        List<Object> missing = new ArrayList<Object>();
        for (Object o : needList) {
            if (actualList != null && actualList.contains(o)) {
                continue;
            }
            missing.add(o);
        }
        return missing.isEmpty() ? null : missing;
    }

    /**
     * Converts a list of {@code Integer} values to a list of {@code Long} values.
     *
     * @param ids the values to widen, may be {@code null}
     * @return a new list holding each value as a {@code Long}, or {@code null} when the
     *         input is null or empty
     */
    public static List<Long> integerListToLong(List<Integer> ids) {
        if (isNullOrEmpty(ids)) {
            return null;
        }
        List<Long> widened = new ArrayList<Long>();
        for (Integer id : ids) {
            long value = id;
            widened.add(value);
        }
        return widened;
    }

    /**
     * List collision: returns the expected elements that are missing from the list
     * that actually occurred.
     *
     * @param allList
     *            the list that should appear in theory
     * @param conflictList
     *            the list that actually appeared
     * @return {@code null} when {@code allList} is null or empty; {@code allList}
     *         itself when {@code conflictList} is null or empty; otherwise a new list
     *         of the elements of {@code allList} not contained in {@code conflictList},
     *         or {@code null} when nothing is missing
     */
    public static List<?> listConflict(List<?> allList, List<?> conflictList) {
        if (isNullOrEmpty(allList)) {
            return null;
        }
        if (isNullOrEmpty(conflictList)) {
            return allList;
        }
        List<Object> lost = new ArrayList<Object>();
        for (Object expected : allList) {
            if (!conflictList.contains(expected)) {
                lost.add(expected);
            }
        }
        return isNullOrEmpty(lost) ? null : lost;
    }

    /**
     * Picks an index at random, weighted by the given values.
     *
     * <p>Draws a value from {@code getRanDom(1, sum)} and returns the first index
     * whose running total reaches it.
     *
     * @param prs the weight of each index
     * @return the chosen index, or {@code 0} when no running total reaches the drawn value
     */
    public static Integer bambooParse(Integer... prs) {
        Integer weightTotal = 0;
        for (int i = 0; i < prs.length; i++) {
            weightTotal += prs[i];
        }
        Integer random = getRanDom(1, weightTotal);
        Integer running = 0;
        int index = 0;
        for (Integer weight : prs) {
            running += weight;
            if (random <= running) {
                return index;
            }
            index++;
        }
        return 0;
    }

    /**
     * Adds up the given integers.
     *
     * @param sums the values to add
     * @return the total, or {@code -1} when the argument array is null or empty
     *         (per {@code isNullOrEmpty})
     */
    public static Integer SumInteger(Integer... sums) {
        if (isNullOrEmpty(sums)) {
            return -1;
        }
        Integer accumulated = 0;
        for (int i = 0; i < sums.length; i++) {
            accumulated += sums[i];
        }
        return accumulated;
    }

    /**
     * Probability algorithm: picks an index at random, weighted by the given chances.
     *
     * <p>Draws a value from {@code getRanDom(1, total)} and returns the first index
     * whose running total reaches it.
     *
     * @param chances
     *            the probability weight of each member
     * @return the index of the chosen weight, or {@code -1} when the argument array is
     *         null or empty or no running total reaches the drawn value
     */
    public static Integer getBambooIndex(Integer... chances) {
        if (isNullOrEmpty(chances)) {
            return -1;
        }
        Integer total = SumInteger(chances);
        Integer random = getRanDom(1, total);
        Integer running = 0;
        int index = 0;
        for (Integer chance : chances) {
            running += chance;
            if (random <= running) {
                return index;
            }
            index++;
        }
        return -1;
    }

    /**
     * Copies a list, leaving out every null-or-empty element.
     *
     * @param list the source list, may be {@code null}
     * @return a new list with the elements that have content, or {@code null} when the
     *         input is null or empty or no element has content
     */
    public static List<?> removeEmpty(List<?> list) {
        if (StringUtil.isNullOrEmpty(list)) {
            return null;
        }
        List<Object> kept = new ArrayList<Object>(list.size());
        for (Object element : list) {
            if (!isNullOrEmpty(element)) {
                kept.add(element);
            }
        }
        return isNullOrEmpty(kept) ? null : kept;
    }

    /**
     * Picks an index at random, weighted by fractional chances.
     *
     * <p>The weights are scaled by a common power of ten so that each becomes a whole
     * number, then handed to the {@code Integer...} overload. The scale is the LARGEST
     * number of decimal places used by any weight. Previously the decimal count of
     * whichever weight came last was used, so for (0.25, 0.5) the first weight was
     * truncated to 2 instead of scaled to 25. The scaling is done on the exact decimal
     * form of each float, avoiding binary rounding such as 0.29f * 100 truncating to 28.
     *
     * @param chanceSources the probability weight of each member; the array is not modified
     * @return the index of the chosen weight, or {@code -1} when the argument array is
     *         null or empty
     * @throws NumberFormatException if a weight is NaN or infinite
     */
    public static Integer getBambooIndex(Float... chanceSources) {
        if (isNullOrEmpty(chanceSources)) {
            return -1;
        }
        BigDecimal[] exact = new BigDecimal[chanceSources.length];
        int scale = 0;
        for (int i = 0; i < chanceSources.length; i++) {
            // Float.toString yields the shortest decimal that round-trips the float.
            exact[i] = new BigDecimal(Float.toString(chanceSources[i]));
            // stripTrailingZeros turns "1.0" into scale 0; whole numbers may go negative.
            scale = Math.max(scale, exact[i].stripTrailingZeros().scale());
        }
        Integer[] chanceInts = new Integer[exact.length];
        for (int i = 0; i < exact.length; i++) {
            chanceInts[i] = exact[i].movePointRight(scale).intValue();
        }
        return getBambooIndex(chanceInts);
    }

    /**
     * Subtracts {@code f2} from {@code f1} using decimal arithmetic on the floats'
     * string forms, so that e.g. 0.3 - 0.1 yields 0.2 rather than a binary
     * approximation of it.
     *
     * @param f1 the minuend
     * @param f2 the subtrahend
     * @return {@code f1 - f2} converted back to a float
     */
    public static Float floatCut(Float f1, Float f2) {
        BigDecimal minuend = new BigDecimal(f1.toString());
        BigDecimal subtrahend = new BigDecimal(f2.toString());
        BigDecimal difference = minuend.subtract(subtrahend);
        return difference.floatValue();
    }

    /**
     * Gets the suffix of a URL: the text after its last dot-separated segment boundary.
     *
     * <p>The URL is split on {@code '.'}; trailing empty segments are discarded by
     * {@link String#split(String)}, so {@code "a.b."} yields {@code "b"}.
     *
     * @param url the URL or file name, may be {@code null}
     * @return the last segment when there are at least two, otherwise an empty string
     *         (also for null or empty input)
     */
    public static String getSuffix(String url) {
        if (!isNullOrEmpty(url)) {
            String[] segments = url.split("\\.");
            int count = segments.length;
            if (count >= 2) {
                return segments[count - 1];
            }
        }
        return "";
    }

}

为了方便我们使用,特意为百度云防护的网站封装了一个工具类RobotHttpHandle,维护了Cookie机制

package org.coody.robot.rote;

import java.util.Date;

import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;

import org.coody.robot.util.HttpEntity;
import org.coody.robot.util.HttpHandle;
import org.coody.robot.util.StringUtil;


/**
 * HTTP helper for sites that answer with a JavaScript challenge page before serving
 * content. It solves the challenge once, keeps the resulting cookie in the static
 * {@link #cookie} field, and sends that cookie with later requests.
 *
 * <p>The cookie is shared by all instances and is not synchronized.
 */
public class RobotHttpHandle {

    /** Upper bound on challenge attempts made by a single {@link #loadCookie(String)} call. */
    private static final int MAX_COOKIE_ATTEMPTS = 5;

    /** Pause before the challenge answer is submitted, in milliseconds. */
    private static final long CHALLENGE_DELAY_MILLIS = 3800L;

    /** Cookie header value obtained from the last challenge attempt; empty when none. */
    public static String cookie="";
    
    
    public RobotHttpHandle(){
        
    }
    
    
    /** Creates an {@code HttpHandle} carrying the request headers used for every call. */
    private HttpHandle newHandle(){
        HttpHandle http=new HttpHandle();
        http.config.setRequestProperty("If-Modified-Since", new Date().toString());
        http.config.setRequestProperty("Cache-Control", "max-age=0");
        http.config.setRequestProperty("Upgrade-Insecure-Requests", "1");
        http.config.setKeepAlive(true);
        return http;
    }
    
    /**
     * Makes one attempt at solving the challenge served for the domain of {@code url}
     * and stores the cookie of the answer response in {@link #cookie}.
     *
     * @param url any URL on the protected site
     * @return the response to the challenge answer when its cookie contains
     *         {@code cf_clearance}; {@code null} when the attempt failed for any
     *         reason (no challenge on the page, script error, interruption, ...)
     */
    private HttpEntity initCookie(String url){
        try {
            String baseURL=HttpHandle.getDomain(url);
            HttpHandle http=newHandle();
            HttpEntity entity = http.Get(baseURL);
            System.out.println(entity.getCookie());
            String html = entity.getHtml();
            // The hidden form fields are looked up in a copy with all spaces removed.
            String temp = html.replace(" ", "");
            String jschl_vc = StringUtil.textCutCenter(temp, "jschl_vc\"value=\"", "\"");
            String pass = StringUtil.textCutCenter(temp, "pass\"value=\"", "\"");

            String funcCode = StringUtil.textCutCenter(html, "setTimeout(function(){", "f.submit();");

            funcCode = funcCode.replace("a.value", "a");
            funcCode = funcCode.replace("  ", " ");
            // NOTE(review): the fixed line positions 1 and 8 depend on the exact layout
            // of the challenge script; a different layout ends in the catch block below.
            String[] tabs = funcCode.split("\n");
            funcCode = tabs[1];
            funcCode += "\r\nt=\"" + baseURL + "\";";
            funcCode += "\r\nr = t.match(/https?:\\/\\//)[0];";
            funcCode += "\r\nt = t.substr(r.length);";
            funcCode += "\r\nt = t.substr(0, t.length - 1);";
            funcCode += tabs[8];
            funcCode += "\r\n return a;";

            funcCode = "function jschl_answer(){\r\n" + funcCode + "\r\n}";

            ScriptEngineManager manager = new ScriptEngineManager();
            ScriptEngine engine = manager.getEngineByName("js");
            if (engine == null) {
                // getEngineByName returns null when no matching engine is installed.
                throw new IllegalStateException("No JavaScript script engine is available");
            }
            // NOTE(review): this evaluates script text received from the remote server.
            engine.eval(funcCode);
            Invocable invocable = (Invocable) engine;
            // The engine may hand back an Integer or a Double; accept any Number.
            Number jschl_answer = (Number) invocable.invokeFunction("jschl_answer");
            String answerUrl=baseURL+"/cdn-cgi/l/chk_jschl?jschl_vc="+jschl_vc+"&pass="+pass+"&jschl_answer="+jschl_answer.intValue();
            http.config.allowRedirects(false);
            System.out.println(answerUrl);
            Thread.sleep(CHALLENGE_DELAY_MILLIS);
            http.config.setGzip(true);
            entity=http.Get(answerUrl);
            cookie=entity.getCookie();
            if(cookie==null||!cookie.contains("cf_clearance")){
                return null;
            }
            return entity;
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            return null;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
    
    /**
     * Fetches {@code url} with the cached cookie. When the response status is not 200
     * the challenge is solved again via {@link #loadCookie(String)} and the request is
     * repeated once.
     *
     * <p>A cookie that is already cached is reused as-is; the challenge is only solved
     * when a request is actually rejected.
     *
     * @param url the URL to fetch
     * @return the response of the last request made
     */
    public HttpEntity Get(String url){
        HttpHandle http=newHandle();
        http.config.setCookie(cookie);
        HttpEntity entity=http.Get(url);
        if(entity==null||entity.getCode()!=200){
            loadCookie(url);
            http.config.setCookie(cookie);
            entity=http.Get(url);
        }
        return entity;
    }
    
    /**
     * Discards the cached cookie and tries to obtain a fresh one, making at most
     * {@code MAX_COOKIE_ATTEMPTS} attempts and stopping early when the current thread
     * is interrupted. If no attempt produced a cookie, {@link #cookie} is left as an
     * empty string.
     *
     * @param url any URL on the protected site
     */
    public void loadCookie(String url){
        cookie=null;
        for(int attempt=0;attempt<MAX_COOKIE_ATTEMPTS;attempt++){
            if(initCookie(url)!=null){
                return;
            }
            if(Thread.currentThread().isInterrupted()){
                break;
            }
        }
        if(cookie==null){
            cookie="";
        }
    }
    
    
    public static void main(String[] args) throws NoSuchMethodException, ScriptException, InterruptedException {
        HttpEntity entity=new RobotHttpHandle().Get("http://www.myexception.cn/");
        System.out.println(entity.getHtml());
    }
}

使用方式:

    HttpEntity entity=new RobotHttpHandle().Get("http://www.myexception.cn/");
    System.out.println(entity.getHtml());

如图:

Java爬虫技术之绕过百度云防护抓取网站内容

点赞
收藏
评论区
推荐文章
待兔 待兔
3个月前
手写Java HashMap源码
HashMap的使用教程HashMap的使用教程HashMap的使用教程HashMap的使用教程HashMap的使用教程22
Easter79 Easter79
3年前
Vue+Flask实现简单的登录验证跳转
文件位置:!输入图片说明(https://static.oschina.net/uploads/img/201711/27171542_g7Yr.png"在这里输入图片标题")login.html<!DOCTYPEhtml<htmllang"en"<head<metacharset
Stella981 Stella981
3年前
Fiddle设置iphone抓包
_注意:保证手机和电脑在一个网络上_一,设置Fiddler!输入图片说明(https://static.oschina.net/uploads/img/201707/27100350_ckfa.png"在这里输入图片标题")!输入图片说明(https://static.oschina.net/uploads/img/201707
Stella981 Stella981
3年前
Intellij idea或者Android Studio实用Live_Templates好用模板整理
Live\_Templates的作用是自动补全代码:自定义补全代码:单例模式:sin!输入图片说明(https://static.oschina.net/uploads/img/201707/17114328_kR84.png"在这里输入图片标题")Templatetext:privatestatic
Wesley13 Wesley13
3年前
#H5#WebStorm的警告
!输入图片说明(https://static.oschina.net/uploads/img/201611/05163209_Lj0x.png"在这里输入图片标题")今天写了个小时的HTML代码,WS提示forminputwithoutanassociatedlabelortitleattribute,然后搜索了下看到百度知道的一个
Stella981 Stella981
3年前
Canvas
Canvas04柱状图!输入图片说明(https://static.oschina.net/uploads/img/201707/04223024_wxwr.png"在这里输入图片标题")代码如下:<!DOCTYPEhtml<htmllang"zhCN"<head<metacharset"UTF8"<met
Stella981 Stella981
3年前
IntelliJ IDEA 15搭建maven结构的web项目
1.首先得安装好IDEA软件和maven,具体怎么安装百度。\2.打开idea编辑器,fileSettingsmaven导入maven插件\!输入图片说明(https://static.oschina.net/uploads/img/201608/03092537_VGr9.png"在这里输入图片标题")!输
Stella981 Stella981
3年前
Elasticsearch学习总结八 ElasticSearch中的聚合操作
首先准备数据,索引包含四个字段fieldA,fieldB,fieldC,fieldD,如下图,以下案列中都使用了基本REST命令和JavaAP两种方式实现!输入图片说明(https://static.oschina.net/uploads/img/201706/15212621_M2dc.png"在这里输入图片标题")1).首先按照某
Wesley13 Wesley13
3年前
PHP 与 GO
PHP!输入图片说明(https://static.oschina.net/uploads/img/201608/22112754_GEtW.png"在这里输入图片标题")输出json!输入图片说明(https://static.oschina.net/uploads/img/201608/22112825_n
Stella981 Stella981
3年前
Delivery Pipeline
deliverypipeline!输入图片说明(https://static.oschina.net/uploads/img/201702/04110334_cl3u.png"在这里输入图片标题")下面这个是netflix的部署流程:!输入图片说明(https://static.oschina.net/uploads/img/2017