最近花时间学习了一下使用Java获取网站数据的方法,自己也亲自动手实践了一下;共获取3000+条数据,去除重复数据后剩余2000+条,并使用JFreeChart根据电影评分做出了几张简单的统计图。
电影评分统计图:
JFreeChart生成图片
使用jsoup获取该网站的电影数据信息。此网站的数据是动态加载的,直接查看网页源代码是看不到数据的;可以分析页面js发出的请求,直接访问其返回JSON数据的接口来获取相应的数据:
部分代码如下:
movieServlet.java
主要的功能为:获取网站的电影数据
首先获取每一个电影分类的链接:
HashMap<String, String> urlandnames = new HashMap<String, String>(); MovieService movieService = new MovieService(); // 排行榜页面 String url = "http://movie.douban.com/chart"; // 获取分类的所有相对链接和分类名称 try { Document kinds = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36") .timeout(10000) .get(); Elements elements = kinds.select("#content .types a"); for(Element element : elements){ String kindurl = element.attr("href"); // 链接地址 String name = element.text(); // 类别 urlandnames.put(kindurl,name); } } catch (IOException e) { e.printStackTrace(); System.out.println("获取urlandname出现错误!!"); } //获取所有的key Set<String> keySet = urlandnames.keySet(); //迭代key值 Iterator<String> iterator = keySet.iterator(); List<Movie> allMovies = new ArrayList<Movie>(); while(iterator.hasNext()){ // 获取到key值,即url String next = iterator.next(); // 根据某一个类别的链接,获取行对应的电影数据 List<Movie> listMovie = getMovieInfo(next); allMovies.addAll(listMovie); }
根据对应的链接获取相应的数据,保存至数据库:
/** * 获取种类电影信息,保存到数据库 * @param url 某一个种类的链接地址 */ private List<Movie> getMovieInfo(String url){ String[] tempurl = url.split("&"); String finalurl = "http://movie.douban.com/j/chart/top_list_count?"+tempurl[1]+"&"+tempurl[2]; // finalurl ---------http://movie.douban.com/j/chart/top_list_count?type=18&interval_id=100:90 String document = null; try { //获取该类别影片的数量total、可在线观看数量playable_count document = Jsoup.connect(finalurl).timeout(10000).ignoreContentType(true).execute().body(); // document------{"playable_count":18,"total":32,"unwatched_count":32}可在线观看18部,共32部,未观看32部 } catch (IOException e) { e.printStackTrace(); } //json解析器 JsonParser parser = new JsonParser(); //获取json对象 JsonObject jsonObject = (JsonObject) parser.parse(document); //将json数据转为int型数据 int movienum = jsonObject.get("total").getAsInt(); System.out.println(movienum);//该类型的数量 String nameurl = "http://movie.douban.com/j/chart/top_list?"+tempurl[1]+"&"+tempurl[2]+"&action=&start=0&limit="+movienum; // nameurl-------------http://movie.douban.com/j/chart/top_list?type=18&interval_id=100:90&action=&start=0&limit=32 FileWriter fw = null; String doc = null; try { //获取该类别的所有影片的信息 doc = Jsoup.connect(nameurl).timeout(10000).ignoreContentType(true).execute().body(); } catch (Exception e) { e.printStackTrace(); } //将json的一个对象数组解析成JsonElement对象 JsonElement element = null; try { //通过JsonParser对象可以把json格式的字符串解析成一个JsonElement对象 element = parser.parse(doc); } catch (NullPointerException e) { e.printStackTrace(); } JsonArray jsonArray = null; if(element.isJsonArray()){ //JsonElement对象如果是一个数组的话转化成jsonArray jsonArray = element.getAsJsonArray(); } //遍历json的对象数组 Iterator it = jsonArray.iterator(); List<Movie> listMovie = new ArrayList<Movie>(); while (it.hasNext()) { JsonObject e = (JsonObject)it.next(); //电影名称 String name = e.get("title").getAsString(); //豆瓣评分 float score = e.get("score").getAsFloat(); //发布时间 String release_date = e.get("release_date").getAsString(); //类型 JsonArray jsonArray2 = 
e.get("types").getAsJsonArray(); String types = jsonArray2.toString(); //链接地址 String movieUrl = e.get("url").getAsString(); //是否可以在线播放 String is_playable = e.get("is_playable").getAsString(); String substring = movieUrl.substring(0, movieUrl.lastIndexOf("/")); String keyID = substring.substring(substring.lastIndexOf("/"), substring.length()); if(cache.get(keyID) != null){ String value = (String) cache.get(keyID).getObjectValue(); if(!name.equals(value)){ net.sf.ehcache.Element element2 = new net.sf.ehcache.Element(keyID,name); cache.put(element2); }else { // System.out.println("重复的 movie Info"); continue; } }else { net.sf.ehcache.Element element2 = new net.sf.ehcache.Element(keyID,name); cache.put(element2); } Movie movie = new Movie(); movie.setName(name); movie.setTypes(types); movie.setRelease_date(release_date); movie.setScore(score); movie.setMovieUrl(movieUrl); movie.setIs_playable(is_playable); //在控制台输出 // System.out.println(movie.toString()); // System.out.println("正在获取数据ing..."); listMovie.add(movie); } return listMovie; }
ScoreServlet.java 主要是生成图表
生成柱状图:
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { String method = request.getParameter("method"); System.out.println(method+"===================method"); MovieService movieService = new MovieService(); Map<String, Integer> map = movieService.Count(); Integer one = map.get("one"); Integer two = map.get("two"); Integer three = map.get("three"); Integer four = map.get("four"); Integer five = map.get("five"); if(method.equals("barChart")){ double [][]data = new double[][]{{one},{two},{three},{four},{five}}; String []rowKeys = {">=9",">=8.5",">=8",">=7.5","<7.5"}; String []columnKeys = {"评分"}; CategoryDataset dataset = DatasetUtilities.createCategoryDataset(rowKeys, columnKeys, data); JFreeChart chart = ChartFactory.createBarChart3D( "电影评分柱状图", // 图表标题 "电影", // 目录轴的显示标签 "数量", // 数值轴的显示标签 dataset, // 数据集 PlotOrientation.VERTICAL, // 图表方向:水平、垂直 true, // 是否显示图例(对于简单的柱状图必须是 false) false, // 是否创建工具提示 (tooltip) false // 是否生成 URL 链接 ); CategoryPlot plot = chart.getCategoryPlot(); // 设置网格背景颜色 plot.setBackgroundPaint(Color.white); // 设置网格竖线颜色 plot.setDomainGridlinePaint(Color.pink); // 设置网格横线颜色 plot.setRangeGridlinePaint(Color.pink); // 显示每个柱的数值,并修改该数值的字体属性 BarRenderer3D renderer=new BarRenderer3D(); renderer.setBaseItemLabelGenerator(new StandardCategoryItemLabelGenerator()); renderer.setBaseItemLabelsVisible(true); renderer.setBasePositiveItemLabelPosition(new ItemLabelPosition(ItemLabelAnchor.OUTSIDE12, TextAnchor.BASELINE_LEFT)); renderer.setItemLabelAnchorOffset(10D); // 设置平行柱的之间距离 renderer.setItemMargin(0.4); plot.setRenderer(renderer); FileOutputStream fos_jpg = null; try { //将图片保存至Tomcat服务器WebRoot下的img目录中 fos_jpg = new FileOutputStream(request.getSession().getServletContext().getRealPath("/")+"barChart.jpg"); ChartUtilities.writeChartAsJPEG(fos_jpg,1,chart,700,500,null); } catch (Exception e) { System.out.println("error"); } finally { try { fos_jpg.close(); } catch (Exception e) { 
System.out.println("error2"); } } request.setAttribute("barChart", "barChart.jpg"); }
生成饼状图:
// Pie chart: share of movies in each of the five rating buckets.
MovieService movieService = new MovieService();
Map<String, Integer> map = movieService.Count();
Integer one = map.get("one");
Integer two = map.get("two");
Integer three = map.get("three");
Integer four = map.get("four");
Integer five = map.get("five");

// Constant-first comparison: a request without "method" must not throw.
if ("pieChart".equals(method)) {
    DefaultPieDataset data = new DefaultPieDataset();
    data.setValue(">=9",one);
    data.setValue(">=8.5",two);
    data.setValue(">=8",three);
    data.setValue(">=7.5",four);
    data.setValue("<7.5",five);

    JFreeChart chart = ChartFactory.createPieChart3D(
            "评分饼状图",   // chart title
            data,
            true,          // show legend
            false,         // no tooltips
            false          // no URL links
    );

    // Section labels: "<bucket> <percentage>".
    PiePlot pieplot = (PiePlot) chart.getPlot();
    pieplot.setLabelFont(new Font("宋体", Font.PLAIN, 12));
    pieplot.setNoDataMessage("无数据");
    pieplot.setCircular(true);
    pieplot.setLabelGap(0.02D);
    pieplot.setLabelGenerator(new StandardPieSectionLabelGenerator("{0} {2}",NumberFormat.getNumberInstance(),new DecimalFormat("0.00%")));

    PiePlot3D pieplot3d = (PiePlot3D) chart.getPlot();
    // Start angle of the first section.
    pieplot3d.setStartAngle(120D);
    // Sections are laid out clockwise.
    pieplot3d.setDirection(Rotation.CLOCKWISE);
    // Foreground alpha: 0 is fully transparent, 1 is opaque.
    pieplot3d.setForegroundAlpha(0.7F);

    FileOutputStream fos_jpg = null;
    try {
        // The image is written directly under the web application's root path.
        fos_jpg = new FileOutputStream(request.getSession().getServletContext().getRealPath("/")+"pieChart.jpg");
        ChartUtilities.writeChartAsJPEG(fos_jpg,1,chart,700,500,null);
    } catch (Exception e) {
        // Best effort: a rendering failure must not break the request.
        System.out.println("error");
        e.printStackTrace();
    } finally {
        // Close only a stream that was actually opened.
        if (fos_jpg != null) {
            try {
                fos_jpg.close();
            } catch (IOException e) {
                System.out.println("error2");
                e.printStackTrace();
            }
        }
    }
    request.setAttribute("pieChart", "pieChart.jpg");
}
生成折线图
// Line chart: number of movies for each individual rating value.
// Constant-first comparison: a request without "method" must not throw.
if ("lineChart".equals(method)) {
    XYSeriesCollection collection = new XYSeriesCollection();
    XYSeries series = new XYSeries("折线");
    Map<String, Integer> map2 = movieService.lineChart();
    // Plot exactly the entries the map holds (key = rating text, value = count)
    // rather than re-deriving the keys from a hard-coded 9.9 start, which
    // plotted the wrong ratings whenever an entry was missing.
    for (Map.Entry<String, Integer> entry : map2.entrySet()) {
        series.add(Double.parseDouble(entry.getKey()), entry.getValue());
    }
    collection.addSeries(series);

    JFreeChart chart = ChartFactory.createXYLineChart(
            "评分折线图",
            "评分",
            "数量",
            collection,
            PlotOrientation.VERTICAL,
            true,
            true,
            false);
    XYPlot plot = (XYPlot) chart.getPlot();

    // Draw a shape at every data point.
    XYLineAndShapeRenderer xylinerenderer = (XYLineAndShapeRenderer) plot.getRenderer();
    xylinerenderer.setBaseShapesVisible(true);

    // Print the value next to every data point.
    XYItemRenderer xyitem = plot.getRenderer();
    xyitem.setBaseItemLabelsVisible(true);
    xyitem.setBasePositiveItemLabelPosition(new ItemLabelPosition(ItemLabelAnchor.OUTSIDE12, TextAnchor.BASELINE_CENTER));
    xyitem.setBaseItemLabelGenerator(new StandardXYItemLabelGenerator());
    xyitem.setBaseItemLabelFont(new Font("Dialog", Font.BOLD, 10));
    plot.setRenderer(xyitem);

    FileOutputStream fos_jpg = null;
    try {
        // The image is written directly under the web application's root path.
        fos_jpg = new FileOutputStream(request.getSession().getServletContext().getRealPath("/")+"lineChart.jpg");
        ChartUtilities.writeChartAsJPEG(fos_jpg,1,chart,700,500,null);
    } catch (Exception e) {
        // Best effort: a rendering failure must not break the request.
        System.out.println("error");
        e.printStackTrace();
    } finally {
        // Close only a stream that was actually opened.
        if (fos_jpg != null) {
            try {
                fos_jpg.close();
            } catch (IOException e) {
                System.out.println("error2");
                e.printStackTrace();
            }
        }
    }
    request.setAttribute("lineChart", "lineChart.jpg");
}
MovieDao.java
把数据插入到数据库
public class MovieDao {

    /**
     * Inserts all given movies into the {@code movie} table, one row per movie.
     *
     * <p>A single prepared statement is reused for every row and is closed,
     * together with the connection, when the method finishes.
     *
     * @param listMovie movies to insert; {@code null} or empty inserts nothing
     * @throws RuntimeException wrapping the {@link SQLException} when an
     *         insert or the closing of a JDBC resource fails
     */
    public void save(List<Movie> listMovie){
        if (listMovie == null || listMovie.isEmpty()) {
            // Nothing to insert, so no connection is needed.
            return;
        }
        Connection connection = null;
        PreparedStatement statement = null;
        try {
            connection = JdbcUtils.getConnection();
            String sql = " INSERT INTO movie(NAME,TYPES,release_date,score,movieUrl,is_playable) VALUE( ?,?,?,?,?,? ) ";
            // Prepared once; only the parameters change per row.
            statement = connection.prepareStatement(sql);
            int i = 1;
            for (Movie movie : listMovie) {
                System.out.println("正在插入第"+(i++)+"条数据到数据库ing...");
                statement.setString(1, movie.getName());
                statement.setString(2, movie.getTypes());
                statement.setString(3, movie.getRelease_date());
                statement.setFloat(4, movie.getScore());
                statement.setString(5, movie.getMovieUrl());
                statement.setString(6, movie.getIs_playable());
                statement.execute();
            }
            System.out.println("保存数据完成");
        } catch (SQLException e) {
            System.out.println("保存数据出现错误 MovieDao error");
            e.printStackTrace();
            throw new RuntimeException(e);
        } finally {
            // Statement first, then connection; each only if it was opened.
            try {
                try {
                    if (statement != null) {
                        statement.close();
                    }
                } finally {
                    if (connection != null) {
                        connection.close();
                    }
                }
            } catch (SQLException e) {
                e.printStackTrace();
                throw new RuntimeException(e);
            }
        }
    }
查询所有数据
/**
 * Loads every row of the {@code movie} table.
 *
 * @return all stored movies; an empty list when the table has no rows
 * @throws RuntimeException wrapping the {@link SQLException} when the query
 *         or the closing of a JDBC resource fails
 */
public List<Movie> findAll(){
    Connection connection = null;
    PreparedStatement statement = null;
    ResultSet resultSet = null;
    try {
        connection = JdbcUtils.getConnection();
        String sql = " select * from movie ";
        statement = connection.prepareStatement(sql);
        resultSet = statement.executeQuery();
        List<Movie> list = new ArrayList<Movie>();
        while (resultSet.next()) {
            Movie movie = new Movie();
            movie.setId(resultSet.getInt("id"));
            movie.setName(resultSet.getString("name"));
            movie.setTypes(resultSet.getString("types"));
            movie.setRelease_date(resultSet.getString("release_date"));
            movie.setScore(resultSet.getFloat("score"));
            movie.setMovieUrl(resultSet.getString("movieUrl"));
            movie.setIs_playable(resultSet.getString("is_playable"));
            list.add(movie);
        }
        return list;
    } catch (SQLException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    } finally {
        // Release in reverse order of acquisition; skip whatever was never opened.
        try {
            try {
                if (resultSet != null) {
                    resultSet.close();
                }
            } finally {
                try {
                    if (statement != null) {
                        statement.close();
                    }
                } finally {
                    if (connection != null) {
                        connection.close();
                    }
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }
}
获取不同分数等级的电影数量
/**
 * Counts the movies in each of five rating buckets.
 *
 * <p>Keys of the result: {@code "one"} (score &gt;= 9), {@code "two"}
 * (8.5 &lt;= score &lt; 9), {@code "three"} (8 &lt;= score &lt; 8.5),
 * {@code "four"} (7.5 &lt;= score &lt; 8) and {@code "five"} (score &lt; 7.5).
 * A bucket whose query fails is logged and left out of the map.
 *
 * @return bucket key to number of movies
 */
public Map<String,Integer> Count(){
    // keys[i] is the map key for the result of queries[i].
    String[] keys = {"one", "two", "three", "four", "five"};
    String[] queries = {
            "SELECT COUNT(1) FROM movie WHERE score>=9 ",
            "SELECT COUNT(1) FROM movie WHERE score>=8.5 && score<9 ",
            "SELECT COUNT(1) FROM movie WHERE score>=8 && score<8.5 ",
            "SELECT COUNT(1) FROM movie WHERE score>=7.5 && score<8 ",
            "SELECT COUNT(1) FROM movie WHERE score<7.5 "
    };

    Map<String,Integer> mapCount = new HashMap<String, Integer>();
    Connection conn = null;
    try {
        conn = JdbcUtils.getConnection();
        for (int i = 0; i < keys.length; i++) {
            PreparedStatement stmt = null;
            ResultSet resultSet = null;
            try {
                stmt = conn.prepareStatement(queries[i]);
                resultSet = stmt.executeQuery();
                while (resultSet.next()) {
                    mapCount.put(keys[i], resultSet.getInt(1));
                }
            } catch (SQLException e) {
                // One failed bucket must not prevent the others from being counted.
                e.printStackTrace();
            } finally {
                // Per-query resources are released before the next query runs.
                if (resultSet != null) {
                    try {
                        resultSet.close();
                    } catch (SQLException e) {
                        e.printStackTrace();
                    }
                }
                if (stmt != null) {
                    try {
                        stmt.close();
                    } catch (SQLException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    } finally {
        // The connection was previously never closed.
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
    return mapCount;
}
获取每个电影评分的电影数量
/** * 统计每个分数对应的数量 * @return */ public Map<String,Integer> lineChart(){ Connection conn = null; PreparedStatement stmt = null; ResultSet resultSet = null; Map<String,Integer> mapCount = new HashMap<String, Integer>(); conn = JdbcUtils.getConnection(); String sql = null; int number = 99; for( ; number>=70; number-=1){ //获取9.9 9.1 7.4 ..... String s = number+""; String score = s.charAt(0)+"."+s.charAt(1); sql = "SELECT COUNT(1) FROM movie WHERE score=" + score ; try { stmt = conn.prepareStatement(sql); resultSet = stmt.executeQuery(); while (resultSet.next()) { mapCount.put(score, resultSet.getInt(1)); } } catch (SQLException e) { e.printStackTrace(); } } return mapCount; } }
两分钟抓取数据2000+并保存至数据库中,感觉还是挺慢的,有待优化代码
代码源码:
GitHub:https://github.com/YanKuan-IT/DouBanMoviesInfo_DB.git
注:如有什么做得不对的地方,请指教