OkHttpClient和Jsoup进行网页爬取
通过http请求,返回一个json格式的数据,然后将json数据转化为java对象返回给调用方。Http采用OkHttp库,json转化采用fastjson库。
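<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
让客户满意是我们工作的目标,不断超越客户的期望值来自于我们对这个行业的热爱。我们立志把好的技术通过有效、简单的方式提供给客户,将通过不懈努力成为客户在信息化领域值得信任、有价值的长期合作伙伴,公司提供的服务项目有:申请域名、雅安服务器托管、营销软件、网站建设、长子网站维护、网站推广。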
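<dependencies>
  <dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>bson</artifactId>
    <version>3.6.4</version>
  </dependency>
  <dependency>
    <groupId>com.google.cloud.trace.instrumentation.jdbc</groupId>
    <artifactId>driver</artifactId>
    <version>0.1.1</version>
    <type>pom</type>
  </dependency>
  <dependency>
    <groupId>ch.qos.logback.contrib</groupId>
    <artifactId>logback-mongodb-access</artifactId>
    <version>0.1.5</version>
  </dependency>
  <dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>mongo-java-driver</artifactId>
    <version>3.0.0</version>
  </dependency>
  <dependency>
    <groupId>com.squareup.okio</groupId>
    <artifactId>okio</artifactId>
    <version>1.11.0</version>
  </dependency>
  <dependency>
    <groupId>com.squareup.okhttp3</groupId>
    <artifactId>okhttp</artifactId>
    <version>3.6.0</version>
  </dependency>
  <dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.47</version>
  </dependency>
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>3.8.1</version>
    <scope>test</scope>
  </dependency>
</dependencies>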
package com.ok.http.client.okhttp;
import java.util.List;
import java.util.Map;
public class ExecuteTask {
public static void main(String[] args) throws Exception {
// 调用downloadHtml下载网页
CrawlData crawlData = new CrawlData();
String url = null;
url = "http://top.chinaz.com/all/index.html";
System.out.println("开始爬取,请等待.");
String htmlBody = crawlData.downloadHtml(url);
System.out.println("爬取成功");
// 将下载的数据进行分析
List
package com.ok.http.client.okhttp;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Downloads the HTML body of a web page over HTTP using OkHttp.
 */
public class CrawlData
{
/**
 * Downloads the resource at {@code url} with a GET request and returns its body as text.
 *
 * <p>The body is decoded with the charset declared in the response's Content-Type header
 * (OkHttp falls back to UTF-8 when none is declared) instead of the JVM's platform default
 * charset, so the result no longer depends on the machine the crawler runs on.
 *
 * <p>The HTTP status code is not inspected: the body of an error response is returned as-is.
 *
 * @param url absolute URL of the page to fetch
 * @return the response body as text, or {@code null} if the request or the read failed
 */
public static String downloadHtml(String url) {
    OkHttpClient client = new OkHttpClient();
    // Build the request.
    Request request = new Request.Builder().url(url).build();
    // Execute the call; try-with-resources closes the response (and its connection)
    // even when reading the body throws, which the previous version never did.
    try (Response response = client.newCall(request).execute()) {
        return response.body().string();
    } catch (Exception e) {
        // Best-effort download: callers treat a null result as "page unavailable".
        System.err.println("Failed to download " + url);
        e.printStackTrace();
        return null;
    }
}
}package com.ok.http.client.okhttp;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.*;
public class Analysis {
/**
- 解析数据
- @param htmlBody
- @return
@throws IOException
*/
public static List/**
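 * Gets the record (备案) information shown on the page at the given URL.
 * @param url address of the page to download and parse
 * @return the text of the matched elements, or null when the page could not be downloaded
 * @throws Exception declared by the signature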
*/
private static String getGecordInformation(String url) throws Exception {
    // Fetch the raw page; a null result means there is nothing to parse.
    final String page = CrawlData.downloadHtml(url);
    if (page == null) {
        return null;
    }
    // Parse the HTML and return the combined text of every <p> that is a direct
    // child of an <li class="TMain06List-Left"> element.
    return Jsoup.parse(page).select("li.TMain06List-Left>p").text();
}
}package com.ok.http.client.okhttp;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import org.bson.Document;
import java.util.Map;
/**
 * Persists one crawled record into the {@code information} collection of the
 * {@code sit_rank} MongoDB database running on {@code localhost:27017}.
 */
public class StoreData {

    /**
     * Key whose value becomes the document's {@code _id}.
     * NOTE(review): assumes the producer of the map stores the site name under
     * "siteName" (the original code used a local of that name as the id) - confirm
     * against the code that builds the map.
     */
    private static final String ID_SOURCE_KEY = "siteName";

    /**
     * Inserts the entries of {@code dataMap} as a single MongoDB document.
     *
     * <p>Every entry of the map becomes a field of the document (keys are converted with
     * {@link String#valueOf(Object)}). When the map holds a non-null value under
     * {@code "siteName"}, that value is also used as the document's {@code _id};
     * otherwise the driver generates one.
     *
     * <p>Earlier versions round-tripped the map through {@code dataMap.toString()} and a JSON
     * parser, which fails for ordinary {@code java.util} maps because their {@code toString()}
     * output is not JSON, and then overwrote {@code _id} and all parsed fields with
     * {@code null}. The map is now copied directly.
     *
     * <p>A new client connection is opened and closed on every call. Failures are reported
     * on {@code System.err} and never propagate to the caller.
     *
     * @param dataMap field names mapped to their values; a {@code null} map is reported as an
     *     error and nothing is stored
     */
    public static void adds(Map<?, ?> dataMap) {
        if (dataMap == null) {
            System.err.println(IllegalArgumentException.class.getName() + ": dataMap must not be null");
            return;
        }

        MongoClient mongoClient = null;
        try {
            // Connect to the MongoDB service.
            mongoClient = new MongoClient("localhost", 27017);
            // Select the database.
            MongoDatabase mongoDatabase = mongoClient.getDatabase("sit_rank");
            System.out.println(mongoDatabase);
            System.out.println("成功连接数据库");
            MongoCollection<Document> collection = mongoDatabase.getCollection("information");
            System.out.println(collection);
            System.out.println("集合 information 选择成功");

            // Build the document straight from the map so the crawled values are kept.
            Document document = new Document();
            for (Map.Entry<?, ?> entry : dataMap.entrySet()) {
                document.append(String.valueOf(entry.getKey()), entry.getValue());
            }
            // Only set _id when a value is available; a null _id would make every
            // insert after the first fail with a duplicate-key error.
            Object siteName = dataMap.get(ID_SOURCE_KEY);
            if (siteName != null) {
                document.put("_id", siteName);
            }

            // A single document is inserted with insertOne; insertMany takes a list.
            collection.insertOne(document);
            System.out.println("文档插入成功");
        } catch (Exception e) {
            System.err.println(e.getClass().getName() + ": " + e.getMessage());
        } finally {
            // Close the connection on success and on failure alike.
            if (mongoClient != null) {
                mongoClient.close();
                System.out.println("MongoDB连接已关闭");
            }
        }
    }
}
网页标题:OkHttpClient和Jsoup进行网页爬取
新闻来源:http://pcwzsj.com/article/gecjii.html