【校招VIP】Java爬虫实战代码- 校招VIP

转载声明：原文链接：https://blog.csdn.net/xiongyouqiang/article/details/79380177

业务背景
大家在平时的生活或工作中多少都会遇到类似下面的情况吧

非技术人员：
我身边有同学在一家装修设计公司上班，她每天的工作就是去其他各大装修平台，去“借鉴”别人家设计师的创意，找到合适的图片，就会一张张点击图片另存到自己电脑中。
其实这些工作都是重复性且毫无技术含量，完全可以用工具自动化实现。

技术人员：
比如我喜欢看一些技术帖子（微信公众号，技术博客等），有时候会觉得文章中的一些技术原理、架构图片非常直观，为了方便下次巩固这些技术，我一般都会把图片保存下来。
如果图片不多的话，一般有如下方法
1 点击图片另存为（原图像素还不错）
2 用手机拍照（像素不好）
如果要下载保存的图片过多，通过以上两种方式去抓取图片，有两个弊端
1 效率低下
2 重复工作，浪费不必要的时间
作为一位懒惰的码农，怎么可以把时间浪费在不需要脑力的事情上呢？为了减少重复性的工作，便有了这篇文章，我这里写的并不是很深入，只是提供一个思路，实现简单从网页中抓取所有图片并重命名保存到电脑中；希望对大家有所帮助。

开发环境
JDK 1.6 及以上
Eclipse或Intellij idea
Maven
编码

package com.xyq.maventest.util;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;


/****
 * Crawls a single web page for images and saves each one to a local
 * directory under a sequential file name (1.png, 2.jpg, ...).
 *
 * <p>Not thread-safe: the running file counter {@link #imageIndex} is a
 * shared static field that is incremented without synchronization.
 *
 * @ClassName: DownloadImageUtil
 * @author youqiang.xiong
 * @date 2018-02-26 12:09:29
 */
public class DownloadImageUtil {

    /** URL of the page whose images are crawled by {@link #main(String[])}. */
    public static final String REQUEST_URL = "https://www.cnblogs.com/EasonJim/p/6919369.html";

    /** Directory the downloaded images are written to. */
    public static final String IMAGE_SAVE_PATH = "C:\\Users\\youqiang.xiong\\Desktop\\image\\test";

    /**
     * Regex matching one {@code <img ...>} tag that carries a {@code src=}
     * attribute (this also covers {@code data-src=}).
     *
     * <p>The tag body is matched with {@code [^>]} rather than {@code .} so a
     * match can never run past the end of its own tag into a neighbouring
     * tag on the same line, and so attributes wrapped over several lines
     * are still found.
     */
    public static final String IMGURL_REG = "<img\\b[^>]*?src=[^>]*>";

    /**
     * Regex extracting an absolute http/https URL from an img tag. The match
     * includes ONE trailing terminator (a double quote, {@code >} or a run
     * of whitespace) which the caller must strip.
     */
    public static final String IMGSRC_REG = "(http|https):\"?(.*?)(\"|>|\\s+)";

    /**
     * Markers that identify an image URL. Entries starting with '=' match
     * query parameters such as {@code wx_fmt=png}; the others match anywhere
     * in the URL. The first entry found in the URL decides the saved file's
     * extension.
     */
    public static String[] IMAGE_TYPE_SUFFIX = new String[]{"=png","=jpg","=jpeg",".png",".jpg","jpeg"};

    /** Number used for the next saved file name; starts at 1 and grows by one per saved image. */
    public static Integer imageIndex = 1;

    /** Compiled once instead of on every call; tag names are matched case-insensitively. */
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile(IMGURL_REG, Pattern.CASE_INSENSITIVE);

    /** Compiled once instead of once per img tag. */
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile(IMGSRC_REG);

    /** Maximum time to wait for the TCP connection to an image host. */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;

    /** Maximum time to wait for data, so a stalled server cannot hang the crawl forever. */
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /** Size of the copy buffer used while streaming an image to disk. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Fetches {@link #REQUEST_URL}, extracts every image URL from it and
     * downloads each one into {@link #IMAGE_SAVE_PATH}. A failure on one
     * image is reported and does not stop the remaining downloads.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        // Step 1: fetch the page. parseContext returns null when the request fails.
        String htmlContent = parseContext(REQUEST_URL);
        if (StringUtils.isEmpty(htmlContent)) {
            System.err.println("无法获取【" + REQUEST_URL + "】的页面内容，未抓取任何图片。");
            return;
        }

        // Step 2: collect the image links (src / data-src) from tags such as
        // <img class="" data-src="https://mmbiz.qpic.cn/mmbiz_png/.../640?wx_fmt=png" data-type="png" width="861" />
        List<String> imageUrlList = getImageSrc(htmlContent);

        // Step 3: download them one by one.
        for (String imageUrl : imageUrlList) {
            try {
                download(imageUrl, IMAGE_SAVE_PATH);
            } catch (Exception e) {
                // Report which URL failed; e.getMessage() alone may be null.
                System.err.println("下载失败【" + imageUrl + "】：" + e);
            }
        }

        System.out.println("从【"+REQUEST_URL+"】网站，共抓取【"+(imageIndex-1)+"】张图片。");
    }

    /***
     * Downloads one image into {@code savePath}, naming it with the current
     * {@link #imageIndex} plus an extension derived from the URL.
     *
     * <p>URLs that are null, empty, or contain none of the
     * {@link #IMAGE_TYPE_SUFFIX} markers are skipped silently. The counter
     * is advanced only after a successful download.
     *
     * @param oldUrl   image URL
     * @param savePath directory to save the image into
     * @throws Exception if the download or the file write fails
     */
    public static void download(String oldUrl,String savePath) throws Exception {

        if (StringUtils.isEmpty(oldUrl)) {
            return;
        }
        String extension = resolveExtension(oldUrl);
        if (extension == null) {
            // Not recognisable as an image link.
            return;
        }
        String filename = String.valueOf(imageIndex) + extension;
        download(oldUrl, filename, savePath);
        imageIndex++;
    }

    /**
     * Returns the file extension (always starting with a dot) for the first
     * {@link #IMAGE_TYPE_SUFFIX} marker contained in {@code url}, compared
     * case-insensitively, or {@code null} when no marker is present.
     */
    private static String resolveExtension(String url) {
        String lowerUrl = url.toLowerCase(Locale.ROOT);
        for (String suffix : IMAGE_TYPE_SUFFIX) {
            if (StringUtils.isEmpty(suffix)) {
                continue;
            }
            if (lowerUrl.contains(suffix.toLowerCase(Locale.ROOT))) {
                String extension = suffix.replace("=", ".");
                // A bare marker such as "jpeg" would otherwise yield "1jpeg".
                return extension.startsWith(".") ? extension : "." + extension;
            }
        }
        return null;
    }

    /*****
     * Downloads the resource at {@code urlString} and writes it to
     * {@code savePath/filename}, creating the directory when necessary.
     *
     * @param urlString image URL
     * @param filename  name of the file to create
     * @param savePath  directory to save the file into
     * @throws IllegalArgumentException if any argument is null or empty
     * @throws IOException if the directory cannot be created, the connection
     *         fails or times out, or the file cannot be written
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static void download(String urlString, String filename, String savePath) throws Exception {

        if(StringUtils.isEmpty(urlString) || StringUtils.isEmpty(filename) || StringUtils.isEmpty(savePath)){
            throw new IllegalArgumentException("方法入参不能为空！");
        }

        // Create the target directory if needed. mkdirs() is tried first and
        // isDirectory() re-checked so an already existing directory is not an error.
        File dir = new File(savePath);
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("无法创建图片保存目录：" + dir.getPath());
        }

        URLConnection con = new URL(urlString).openConnection();
        con.setConnectTimeout(CONNECT_TIMEOUT_MS);
        con.setReadTimeout(READ_TIMEOUT_MS);

        // Nested try/finally (Java 6 compatible) so both streams are closed
        // even when reading or writing fails part-way through.
        InputStream is = con.getInputStream();
        try {
            OutputStream os = new FileOutputStream(new File(dir, filename));
            try {
                byte[] buffer = new byte[BUFFER_SIZE];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    os.write(buffer, 0, len);
                }
            } finally {
                os.close();
            }
        } finally {
            is.close();
        }
    }

    /****
     * Performs an HTTP GET with Apache HttpClient and returns the response
     * body as text.
     *
     * @param url address to request
     * @return the response body, or {@code null} when the request fails or
     *         the response has no entity
     * @throws IllegalArgumentException if {@code url} is null or empty
     */
    public static String  parseContext(String url) {

        if(StringUtils.isEmpty(url)){
            throw new IllegalArgumentException("访问地址url不能为空");
        }

        String html = null;
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            CloseableHttpResponse response = httpclient.execute(new HttpGet(url));
            try {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // UTF-8 is only the fallback; a charset declared in the
                    // Content-Type header still takes precedence.
                    html = EntityUtils.toString(entity, "UTF-8");
                }
            } finally {
                response.close();
            }
        } catch (IOException e) {
            // Covers ClientProtocolException too. The failure is reported and
            // signalled to the caller through the null return value.
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } finally {
            // Release the connection resources.
            try {
                httpclient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        return html;
    }

    /***
     * Collects the raw text of every img tag in the page that has a
     * {@code src=} attribute, with single quotes removed so that
     * single-quoted attribute values end at whitespace or {@code >}.
     *
     * @param htmlContext page HTML
     * @return the matched tags, in document order
     * @throws IllegalArgumentException if {@code htmlContext} is null or empty
     */
    private static List<String> getImageUrl(String htmlContext) {

        if(StringUtils.isEmpty(htmlContext)){
            throw new IllegalArgumentException("html请求内容不能为空.");
        }

        List<String> listImgUrl = new ArrayList<String>();
        Matcher matcher = IMG_TAG_PATTERN.matcher(htmlContext);
        while (matcher.find()) {
            listImgUrl.add(matcher.group().replace("'", ""));
        }
        return listImgUrl;
    }

    /***
     * Extracts the absolute http/https URLs found inside the page's img tags.
     *
     * @param htmlContext page HTML
     * @return the image URLs, in document order (duplicates are kept)
     * @throws IllegalArgumentException if {@code htmlContext} is null or empty
     */
    public static  List<String> getImageSrc(String htmlContext) {

        if(StringUtils.isEmpty(htmlContext)){
            throw new IllegalArgumentException("html请求内容不能为空.");
        }

        List<String> listImgSrc = new ArrayList<String>();
        for (String imageContext : getImageUrl(htmlContext)) {
            Matcher matcher = IMG_SRC_PATTERN.matcher(imageContext);
            while (matcher.find()) {
                String hit = matcher.group();
                // Drop the terminator the regex consumed. trim() removes the
                // rest when that terminator was a run of several whitespace chars.
                listImgSrc.add(hit.substring(0, hit.length() - 1).trim());
            }
        }
        return listImgSrc;
    }


}

说明：需要引入httpclient和commons-lang两个jar包
我的项目是通过maven管理，所以只需要在pom.xml中添加以下配置即可

<!-- JAR published by the Apache open-source organization -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.4</version>
        </dependency>
        <!-- Apache utility JAR with helper classes for strings, numbers, reflection, etc. -->
        <dependency>  
          <groupId>commons-lang</groupId>  
          <artifactId>commons-lang</artifactId>  
          <version>2.6</version>  
        </dependency>

测试效果
1、修改常量
修改REQUEST_URL 和IMAGE_SAVE_PATH 两个常量值，改成你想抓取的网址url和保存图片的路径即可

/***
     * URL of the page to crawl.
     */
public static final String REQUEST_URL = "https://www.cnblogs.com/EasonJim/p/6919369.html";
    /****
     * Directory the downloaded images are saved to.
     */
public static final String IMAGE_SAVE_PATH = "C:\\Users\\youqiang.xiong\\Desktop\\image\\test";

2、运行main方法
3、等待不久，Console控制台会输出一段信息

从【https://www.cnblogs.com/EasonJim/p/6919369.html】网站，共抓取【7】张图片。

4、打开C:\Users\youqiang.xiong\Desktop\image\test 目录查看图片是否成功生成

TODO
以上功能还有一些需要完善和优化的地方，由于时间有限这里还没有太多时间去研究，后续会进一步补充。
1 加入多线程，同时抓取多个网站的图片
2 利用Java swing技术开发图形界面，供普通用户使用