|
@@ -0,0 +1,54 @@
|
|
|
+package com.sf.util;
|
|
|
+
|
|
|
+import com.sf.util.vo.HtmlVo;
|
|
|
+import org.jsoup.Jsoup;
|
|
|
+import org.jsoup.nodes.Document;
|
|
|
+import org.jsoup.nodes.Element;
|
|
|
+import org.jsoup.select.Elements;
|
|
|
+import org.springframework.data.util.Pair;
|
|
|
+
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.List;
|
|
|
+
|
|
|
+public class SpiderUtils {
|
|
|
+
|
|
|
+ public static void main(String[] args) throws Exception{
|
|
|
+ String url = "https://www.qidian.com/book/68223/";
|
|
|
+ Document document = Jsoup.connect(url).get();
|
|
|
+
|
|
|
+ String desc = document.select("p#book-intro-detail").text();
|
|
|
+// Elements elements = document.select("ul.volume-chapters>li");
|
|
|
+ Elements elements = document.select("li.chapter-item > a");
|
|
|
+ // 两种方式:
|
|
|
+ // 1) 依次拿到每个li -> 取出text作为章节名 -> 取出href访问具体的章节内容 以此类推
|
|
|
+ // 2) 先拿到所有li的文本信息和href 放到list中 然后依次访问href 相当于批量处理
|
|
|
+
|
|
|
+ // 数据结构 text href
|
|
|
+// List<HtmlVo> list = new ArrayList<>();
|
|
|
+// Pair<String,String> pair = Pair.of("","");
|
|
|
+ List<Pair<String,String>> pairList = new ArrayList<>();
|
|
|
+
|
|
|
+ for (Element element : elements) {
|
|
|
+ String text = element.text();
|
|
|
+ String linkHref = element.attr("href");
|
|
|
+ linkHref = "https:" + linkHref;
|
|
|
+ Pair<String,String> pair = Pair.of(text,linkHref);
|
|
|
+ pairList.add(pair);
|
|
|
+// System.out.println();
|
|
|
+ }
|
|
|
+ System.out.println();
|
|
|
+
|
|
|
+ List<Pair<String,String>> contentList = new ArrayList<>();
|
|
|
+ for (Pair<String, String> pair : pairList) {
|
|
|
+ String text = pair.getFirst();
|
|
|
+ String href = pair.getSecond();
|
|
|
+ Document subDoc = Jsoup.connect(href).get();
|
|
|
+ String content = subDoc.select("main").html();
|
|
|
+ Pair<String,String> contentPair = Pair.of(text,content);
|
|
|
+ contentList.add(contentPair);
|
|
|
+ Thread.sleep(200);
|
|
|
+ System.out.println();
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+}
|