博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Jsoup解析HTML
阅读量:6157 次
发布时间:2019-06-21

本文共 7483 字,大约阅读时间需要 24 分钟。

hot3.png

package net.sc.common.util;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

/**
 * Helpers for pulling name/value data out of HTML parsed with jsoup.
 * Supports selection by multiple attributes.
 *
 * @author Aaron
 * @createTime 2014-07-07
 */
public class JsoupUtil {

    /**
     * Matches a single {@code [name=value]} attribute selector and exposes the two
     * parts through the named groups {@code name} and {@code value}.
     */
    public static final String _REGEX = "\\[(?<name>\\w+)=(?<value>\\w+)\\]";

    /** Full-width colon (U+FF1A), accepted everywhere the ASCII colon is. */
    private static final String FULL_WIDTH_COLON = "\uFF1A";

    private static final String ASCII_COLON = ":";

    /** Compiled form of {@link #_REGEX}; created once per instance in the constructor. */
    Pattern p;

    public JsoupUtil() {
        p = Pattern.compile(_REGEX);
    }

    /**
     * Applies the given selectors one after another, without the "only" filter.
     *
     * @param queryElement element to start searching from; may be {@code null}
     * @param attrs        selectors, applied in order, each one inside the matches of the previous
     * @return the elements matched by the last selector; empty when nothing matches
     * @see #findElement(Element, boolean, String...)
     */
    public Elements findElement(Element queryElement, String... attrs) {
        return this.findElement(queryElement, false, attrs);
    }

    /**
     * Applies the given selectors one after another: the first is run against
     * {@code queryElement}, each following one against the matches of the previous one.
     *
     * <p>When {@code only} is {@code true}, the matches are additionally filtered so that an
     * element is kept only if its attribute count equals the number of distinct attribute
     * names extracted from the selectors and every attribute key of the element is one of
     * those names. If any selector contains no {@code [name=value]} part, the unfiltered
     * matches are returned.
     *
     * <p>TODO: {@code only} handles just the equality form {@code [attr=value]}; the
     * starts-with and ends-with forms are not recognised.
     *
     * <p>NOTE(review): only the first {@code [name=value]} of each selector is extracted, so a
     * selector carrying several attribute parts contributes one name to the filter.
     *
     * @param queryElement element to start searching from; may be {@code null}
     * @param only         whether to keep only elements carrying exactly the searched attributes
     * @param attrs        selectors, applied in order
     * @return the matching elements; empty when {@code queryElement} is {@code null}, no
     *         selectors are given, or an intermediate selector matches nothing
     */
    public Elements findElement(Element queryElement, boolean only, String... attrs) {
        Elements result = new Elements();
        if (queryElement == null || attrs == null || attrs.length == 0) {
            return result;
        }

        Elements searchElements = new Elements(queryElement);
        for (int i = 0; i < attrs.length; i++) {
            Elements matches = new Elements();
            for (Element search : searchElements) {
                matches.addAll(search.select(attrs[i]));
            }
            // Nothing left to search in for the remaining selectors.
            if (matches.isEmpty() && i < attrs.length - 1) {
                return result;
            }
            searchElements = matches;
        }
        result = searchElements;
        if (!only) {
            return result;
        }

        Map<String, String> searchAttrs = new LinkedHashMap<String, String>();
        for (String attr : attrs) {
            Matcher m = p.matcher(attr);
            // A selector without an attribute part: return the unfiltered result set.
            if (!m.find()) {
                return result;
            }
            searchAttrs.put(m.group("name"), m.group("value"));
        }

        result = new Elements();
        for (Element element : searchElements) {
            Attributes attributes = element.attributes();
            if (attributes.size() != searchAttrs.size()) {
                continue;
            }
            boolean allSearched = true;
            for (Attribute attribute : attributes) {
                if (!searchAttrs.containsKey(attribute.getKey())) {
                    allSearched = false;
                    break;
                }
            }
            if (allSearched) {
                result.add(element);
            }
        }
        return result;
    }

    /**
     * Reads name/value pairs from elements that have exactly two children: the text of the
     * first child is the name (a trailing colon is dropped), the text of the second is the
     * value. Elements with any other number of children are skipped.
     *
     * @param eles elements to inspect
     * @return trimmed names mapped to trimmed values, in encounter order
     */
    public Map<String, String> getPropertyEle(Elements eles) {
        Map<String, String> map = new LinkedHashMap<>();
        for (Element ele : eles) {
            Elements childEles = ele.children();
            if (childEles.size() != 2) {
                continue;
            }
            String name = stripTrailingColon(childEles.get(0).text());
            String value = childEles.get(1).text();
            map.put(name.trim(), value.trim());
        }
        return map;
    }

    /**
     * Reads colon-separated name/value pairs from the string form ({@code toString()}) of
     * each node. Nodes without a colon, with nothing before the colon, or whose name part
     * contains {@code onclick=} are skipped.
     *
     * @param nodeList nodes to inspect
     * @return trimmed names mapped to trimmed values, in encounter order
     */
    public Map<String, String> getPropertyColon(List<Node> nodeList) {
        Map<String, String> map = new LinkedHashMap<>();
        for (Node node : nodeList) {
            String[] pair = splitAtColon(node.toString());
            String name = pair[0];
            if (StringUtil.isEmpty(name)) {
                continue;
            }
            if (name.indexOf("onclick=") != -1) {
                continue;
            }
            map.put(name.trim(), pair[1].trim());
        }
        return map;
    }

    /**
     * Reads colon-separated name/value pairs from the text of each element. Elements without
     * a colon, or with nothing before the colon, are skipped.
     *
     * @param eles elements to inspect
     * @return trimmed names mapped to trimmed values, in encounter order
     */
    public Map<String, String> getPropertyColon(Elements eles) {
        Map<String, String> map = new LinkedHashMap<>();
        for (Element ele : eles) {
            String[] pair = splitAtColon(ele.text());
            if (StringUtil.isEmpty(pair[0])) {
                continue;
            }
            map.put(pair[0].trim(), pair[1].trim());
        }
        return map;
    }

    /**
     * Reads a table laid out as alternating name and value cells, using every row.
     *
     * @param table table element to read; may be {@code null}
     * @return names mapped to values, in encounter order
     * @see #getTableColumnData(Element, String)
     */
    public Map<String, String> getTableColumnData(Element table) {
        return this.getTableColumnData(table, "");
    }

    /**
     * Reads a table laid out as alternating name and value cells: in every row the 1st, 3rd,
     * ... cell is a name (a trailing colon is dropped) and the 2nd, 4th, ... cell is its value.
     * Value cells following an empty name are skipped.
     *
     * @param table          table element to read; may be {@code null}
     * @param rowSelectRange comma-separated 1-based positions of the rows to drop before
     *                       reading; a negative number counts from the end ({@code -1} is the
     *                       last row); empty to keep every row
     * @return names mapped to values, in encounter order; empty when {@code table} is {@code null}
     * @throws NumberFormatException     if a position is not an integer
     * @throws IndexOutOfBoundsException if a position does not exist
     */
    public Map<String, String> getTableColumnData(Element table, String rowSelectRange) {
        Map<String, String> map = new LinkedHashMap<>();
        if (table == null) {
            return map;
        }
        Elements trs = table.select("tr");
        removeByRange(trs, rowSelectRange);

        for (Element tr : trs) {
            Elements tds = tr.select("td");
            // When th and td are mixed in one row, use the row's direct children instead.
            if (tr.select("th").size() > 0) {
                tds = tr.children();
            }
            int index = 0;
            String name = "";
            for (Element td : tds) {
                index++;
                if (index % 2 == 0) {
                    if (StringUtil.isEmpty(name)) {
                        continue;
                    }
                    map.put(name, td.text().trim());
                } else {
                    name = stripTrailingColon(td.text()).trim();
                }
            }
        }
        return map;
    }

    /**
     * Reads a table whose first row holds the column names, using every row and column.
     *
     * @param table table element to read; may be {@code null}
     * @return one map per data row, column name to cell text
     * @see #getTableRowData(Element, List, String, String)
     */
    public List<Map<String, String>> getTableRowData(Element table) {
        return getTableRowData(table, null, "", "");
    }

    /**
     * Reads a table whose first remaining row holds the column names.
     *
     * @param table             table element to read; may be {@code null}
     * @param rowSelectRange    rows to drop, see {@link #getTableRowData(Element, List, String, String)}
     * @param columnSelectRange columns to drop, same format as {@code rowSelectRange}
     * @return one map per data row, column name to cell text
     */
    public List<Map<String, String>> getTableRowData(Element table, String rowSelectRange,
            String columnSelectRange) {
        return getTableRowData(table, null, rowSelectRange, columnSelectRange);
    }

    /**
     * Reads a table row by row into maps of column name to cell text.
     *
     * <p>Each row contributes its {@code td} cells, or its {@code th} cells when it has no
     * {@code td}. When {@code selfNameList} is {@code null} or empty, the cells of the first
     * remaining row become the column names and that row yields no data; otherwise
     * {@code selfNameList} supplies the names and the first row is read as data too. Rows that
     * produce no entries are left out.
     *
     * @param table             table element to read; may be {@code null}
     * @param selfNameList      column names to use instead of the first row; may be
     *                          {@code null} or empty
     * @param rowSelectRange    comma-separated 1-based positions of the rows to drop; a
     *                          negative number counts from the end ({@code -1} is the last
     *                          row); empty to keep every row
     * @param columnSelectRange positions of the cells to drop from every row, same format
     * @return one map per data row; empty when {@code table} is {@code null}
     * @throws NumberFormatException     if a position is not an integer
     * @throws IndexOutOfBoundsException if a position does not exist, or a row has more cells
     *                                   than there are column names
     */
    public List<Map<String, String>> getTableRowData(Element table, List<String> selfNameList,
            String rowSelectRange, String columnSelectRange) {
        List<Map<String, String>> valueList = new ArrayList<>();
        if (table == null) {
            return valueList;
        }
        Elements rows = table.select("tr");
        removeByRange(rows, rowSelectRange);

        boolean namesSupplied = selfNameList != null && selfNameList.size() > 0;
        List<String> nameList = namesSupplied ? selfNameList : new ArrayList<String>();

        boolean firstRow = true;
        for (Element row : rows) {
            Elements tds = row.select("td");
            if (tds == null || tds.size() == 0) {
                tds = row.select("th");
            }
            removeByRange(tds, columnSelectRange);

            Map<String, String> pvm = new LinkedHashMap<>();
            int index = 0;
            for (Element td : tds) {
                if (firstRow && !namesSupplied) {
                    // Header row: collect the column names instead of data.
                    nameList.add(td.text().trim());
                } else {
                    pvm.put(nameList.get(index), td.text());
                }
                index++;
            }
            if (pvm.size() > 0) {
                valueList.add(pvm);
            }
            firstRow = false;
        }
        return valueList;
    }

    /**
     * Removes the entries at the given positions from {@code items}, in place.
     *
     * <p>Positions are processed from the last listed to the first, so positive positions
     * must be listed in ascending order for each to refer to the original list. A negative
     * position counts from the end; the number of end-relative removals already done is
     * compensated for.
     *
     * @param items list to remove from
     * @param range comma-separated 1-based positions; {@code null} or empty removes nothing
     */
    private static void removeByRange(Elements items, String range) {
        if (StringUtil.isEmpty(range)) {
            return;
        }
        String[] positions = range.split(",");
        int removedFromEnd = 0;
        for (int i = positions.length - 1; i >= 0; i--) {
            int position = Integer.parseInt(positions[i].trim());
            if (position < 0) {
                items.remove(items.size() - (Math.abs(position) - removedFromEnd));
                removedFromEnd++;
            } else {
                items.remove(position - 1);
            }
        }
    }

    /** Drops one trailing ASCII or full-width colon from {@code name}, if present. */
    private static String stripTrailingColon(String name) {
        if (name.endsWith(ASCII_COLON) || name.endsWith(FULL_WIDTH_COLON)) {
            return name.substring(0, name.length() - 1);
        }
        return name;
    }

    /**
     * Splits {@code text} at its first ASCII colon or, when it has none, at its first
     * full-width colon.
     *
     * <p>NOTE(review): the ASCII-before-full-width precedence is a reconstruction; confirm
     * against the original source if text containing both kinds matters.
     *
     * @return a two-element array {name, value}; both are empty strings when no colon is found
     */
    private static String[] splitAtColon(String text) {
        int colonAt = text.indexOf(ASCII_COLON);
        if (colonAt == -1) {
            colonAt = text.indexOf(FULL_WIDTH_COLON);
        }
        if (colonAt == -1) {
            return new String[] { "", "" };
        }
        return new String[] { text.substring(0, colonAt), text.substring(colonAt + 1) };
    }

    /**
     * Manual smoke-test entry point; intentionally does nothing. Usage examples:
     *
     * <pre>
     * JsoupUtil ju = new JsoupUtil();
     * String path = ju.getClass().getResource("").getPath() + "JsoupUtil.html";
     * Document doc = Jsoup.parse(new File(path), "UTF-8");
     *
     * // Rows whose only attribute is width=100
     * Elements results = ju.findElement(doc, true, "tr[width=100]");
     * for (Element result : results) {
     *     System.out.println(result.text());
     * }
     *
     * // Table rows as maps keyed by the header row
     * List&lt;Map&lt;String, String&gt;&gt; list = ju.getTableRowData(doc.select("table[class=tb6]").get(0));
     * System.out.println(list);
     * </pre>
     */
    public static void main(String args[]) throws Exception {
    }
}

 

转载于:https://my.oschina.net/AaronDMC/blog/751203

你可能感兴趣的文章
极客技术专题【009期】:web技术开发小技巧
查看>>
PHP 简单计算器代码实现
查看>>
正则表达式的知识普及
查看>>
docker使用笔记
查看>>
华为eNSP模拟器上实现FTP服务
查看>>
【全球AI人才排行榜】美国第一,中国仅排名第7
查看>>
微信小程序输入框input
查看>>
MySql字符串函数使用技巧
查看>>
Doc2Vec,Word2Vec文本相似度 初体验。
查看>>
系统ghost后变成一个盘了别的分区的文件怎么找回
查看>>
Win7+Ubuntu11
查看>>
请问华为三层交换机里面的那个从IP是个什么意思? -
查看>>
kFeedback开源啦
查看>>
大数据传输,文件传输的专业解决方案!
查看>>
阿里云专家穆轩的《杭州九年程序员之“修炼”手册》
查看>>
JQuery:deferred对象的方法
查看>>
eyoucms问答 百度权重是什么
查看>>
win10中遇到qq视频时摄像头打不开没反应的解决方法
查看>>
介绍自己的一个Android插桩热修复框架项目QuickPatch
查看>>
关于textarea的ie9的maxlength不起作用的问题,请参考如下URL解决。
查看>>