OkHttpClient和Jsoup进行网页爬取

通过http请求,返回一个json格式的数据,然后将json数据转化为java对象返回给调用方。Http采用OkHttp库,json转化采用fastjson库。

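<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>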

让客户满意是我们工作的目标,不断超越客户的期望值来自于我们对这个行业的热爱。我们立志把好的技术通过有效、简单的方式提供给客户,将通过不懈努力成为客户在信息化领域值得信任、有价值的长期合作伙伴,公司提供的服务项目有:申请域名雅安服务器托管、营销软件、网站建设、长子网站维护、网站推广。

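  <groupId>com.ok.http.client</groupId>
  <artifactId>okhttp</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>okhttp</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>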



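  <dependencies>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.11.3</version>
    </dependency>

    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>bson</artifactId>
      <version>3.6.4</version>
    </dependency>

    <dependency>
      <groupId>com.google.cloud.trace.instrumentation.jdbc</groupId>
      <artifactId>driver</artifactId>
      <version>0.1.1</version>
      <type>pom</type>
    </dependency>

    <dependency>
      <groupId>ch.qos.logback.contrib</groupId>
      <artifactId>logback-mongodb-access</artifactId>
      <version>0.1.5</version>
    </dependency>

    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>mongo-java-driver</artifactId>
      <version>3.0.0</version>
    </dependency>

    <dependency>
      <groupId>com.squareup.okio</groupId>
      <artifactId>okio</artifactId>
      <version>1.11.0</version>
    </dependency>

    <dependency>
      <groupId>com.squareup.okhttp3</groupId>
      <artifactId>okhttp</artifactId>
      <version>3.6.0</version>
    </dependency>

    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.47</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>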


package com.ok.http.client.okhttp;

import java.util.List;
import java.util.Map;

/**
 * Entry point of the crawler: downloads the ranking page, parses it into one
 * record per listed site and hands every record to the storage layer.
 */
public class ExecuteTask {

    /** Ranking page the crawl starts from. */
    private static final String START_URL = "http://top.chinaz.com/all/index.html";

    /**
     * Runs the download, parse and store steps in order.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if parsing the downloaded page fails
     */
    public static void main(String[] args) throws Exception {
        System.out.println("开始爬取,请等待.");
        // downloadHtml is static, so it is called on the class rather than an instance.
        String htmlBody = CrawlData.downloadHtml(START_URL);
        if (htmlBody == null) {
            // downloadHtml returns null when the request failed; parsing null would throw.
            System.err.println("爬取失败: " + START_URL);
            return;
        }
        System.out.println("爬取成功");

        // Turn the downloaded HTML into one key/value record per site.
        List<Map<String, String>> dataList = Analysis.analysisData(htmlBody);
        System.out.println("数据解析成功");

        for (Map<String, String> data : dataList) {
            StoreData.adds(data);
            System.out.println("存储成功");
        }
    }
}

package com.ok.http.client.okhttp;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

/**
 * Downloads web pages over HTTP using OkHttp.
 */
public class CrawlData {

    /**
     * Single client reused for every request. The original built a new client on
     * each call; sharing one lets connections and threads be reused.
     */
    private static final OkHttpClient CLIENT = new OkHttpClient();

    /**
     * Fetches the given URL and returns the response body as text.
     *
     * @param url absolute URL of the page to download
     * @return the response body, or {@code null} if the request or the read failed
     */
    public static String downloadHtml(String url) {
        // Build the request (kept outside the try, as before, so a bad URL still surfaces).
        Request request = new Request.Builder().url(url).build();
        try {
            Response response = CLIENT.newCall(request).execute();
            // string() decodes with the charset declared by the response (falling back
            // to UTF-8) instead of the platform default charset, and closes the body.
            return response.body().string();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}

    package com.ok.http.client.okhttp;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.*;

public class Analysis {
/**

  • 解析数据
  • @param htmlBody
  • @return
  • @throws IOException
    */
    public static List> analysisData(String htmlBody) throws Exception {
    // 获取目标HTML代码
    List> list = new ArrayList>();
    Document doc = Jsoup.parse(htmlBody);
    Elements elements = doc.select("ul.listCentent").select("li");
    System.out.println(" 数据集合大小=====" + elements.size());
    for (Element elmemt : elements) {
    Map map1 = new HashMap();
    // 获取公司名
    String siteName = elmemt.select("div.CentTxt > h4.rightTxtHead > a").text();
    System.out.println("siteName=====" + siteName);
    // 获取域名
    String domainName = elmemt.select("div.CentTxt > h4.rightTxtHead > span").text();
    System.out.println("domainName=====" + domainName);
    // 获取AlexaRank排名
    String AlexaRank = elmemt.select("li.clearfix >div.CentTxt > div.RtCPart >p").text();
    System.out.println("AlexaRank=====" + AlexaRank);
    // 获取公司简介
    String Synopsis = elmemt.select("div.CentTxt> p").text();
    System.out.println("公司简介====" + Synopsis);
    // 获取得分
    String score = elmemt.select("div.RtCRateCent>span").text();
    System.out.println(score);
    // 获取排名
    String siteRank = elmemt.select("div.RtCRateCent> strong").text();
    System.out.println("排名:" + siteRank);
    // 获取网址
    String webSite = "http://top.chinaz.com" + elmemt.select("a").first().attr("href");
    System.out.println("网址:" + webSite);
    // 获取备案信息
    String stringecordInformation = getGecordInformation(webSite);
    System.out.println("备案信息" + stringecordInformation);
    System.out.println("\t");
    // StoreData.add(siteName,domainName, AlexaRank , Synopsis, score, siteRank, webSite ,RecordInformation);
    map1.put("siteName", siteName);
    map1.put("domainName", domainName);
    map1.put("AlexaRank", AlexaRank);
    map1.put("公司简介", Synopsis);
    map1.put("排名", siteRank);
    map1.put("网址", webSite);
    map1.put("备案信息", stringecordInformation);
    list.add(map1);
    }
    return list;
    }

    /**

  • 获取备案信息
  • @param url
  • @return
  • @throws Exception
    */
    private static String getGecordInformation(String url) throws Exception {
    String htmlBody = CrawlData.downloadHtml(url);
    if (htmlBody != null) {
    Document doc = Jsoup.parse(htmlBody);
    String stringecordInformation = doc.select("li.TMain06List-Left>p").text();
    return stringecordInformation;
    }
    return null;
    }
    }

    package com.ok.http.client.okhttp;
    import com.alibaba.fastjson.JSONObject;
    import com.mongodb.MongoClient;
    import com.mongodb.client.MongoCollection;
    import com.mongodb.client.MongoDatabase;
    import org.bson.Document;
    import java.util.Map;

/**
 * Persists crawled site records into the local MongoDB instance.
 */
public class StoreData {

    /**
     * Inserts one record into the {@code information} collection of the
     * {@code sit_rank} database on {@code localhost:27017}.
     *
     * <p>Every entry of {@code dataMap} becomes a field of the stored document. When
     * the map holds a non-empty {@code siteName}, it is also used as the document
     * {@code _id}; otherwise the driver generates one. Failures are reported on
     * {@code System.err} and not rethrown.
     *
     * @param dataMap field name to value pairs of the record to store
     */
    public static void adds(Map<String, String> dataMap) {
        MongoClient mongoClient = null;
        try {
            // Connect to the MongoDB server
            mongoClient = new MongoClient("localhost", 27017);
            // Select the database
            MongoDatabase mongoDatabase = mongoClient.getDatabase("sit_rank");
            System.out.println(mongoDatabase);
            System.out.println("成功连接数据库");

            MongoCollection<Document> collection = mongoDatabase.getCollection("information");
            System.out.println(collection);
            System.out.println("集合 information 选择成功");

            // Build the document straight from the map. The previous code parsed
            // dataMap.toString() as JSON (Map.toString() is not JSON) and then
            // overwrote every field, including _id, with null locals.
            Document document = new Document();
            for (Map.Entry<String, String> entry : dataMap.entrySet()) {
                document.append(entry.getKey(), entry.getValue());
            }
            String siteName = dataMap.get("siteName");
            if (siteName != null && !siteName.isEmpty()) {
                document.put("_id", siteName);
            }

            collection.insertOne(document);
            System.out.println("文档插入成功");
        } catch (Exception e) {
            System.err.println(e.getClass().getName() + ": " + e.getMessage());
        } finally {
            // Close the connection even when the insert failed.
            if (mongoClient != null) {
                mongoClient.close();
                System.out.println("MongoDB连接已关闭");
            }
        }
    }

}


网页标题:OkHttpClient和Jsoup进行网页爬取
新闻来源:http://pcwzsj.com/article/gecjii.html