在 Styletrip 專案內,我們會抓取所有景點資料的相關部落格,然後利用部落格的內容做進一步的分析,所以我們需要有能力從抓到的部落格網頁中取出正文內容。

網頁正文,就是我們平常所看部落格文章最主要的內容區塊。網頁正文提取的演算法有許多種,各自基於不同的方法,準確度也都不同,並且會因為網頁結構不同而有不一樣的結果。
這邊我是採用 VIPS: Vision based Page Segmentation Algorithm, 這個演算法是基於視覺結構來做判斷,也就是可以想成網頁正文通常都是那個網頁裡面看起來最大的區域。
這邊提供最核心、主要的實作方式:
public static BlockProperties parseWebContent(Parser parser) throws ParserException { | |
NodeList visualBlockNodeList = getVisualBlock(parser); | |
// Output.printNodeList( visualBlockNodeList ); | |
NodeList linkNodeList = findLinkBlock(visualBlockNodeList); | |
NodeList invalidNodeList = findInvalidBlock(visualBlockNodeList); | |
NodeList actionNodeList = findActionBlock(visualBlockNodeList); // NOTE: 因為動作標籤可能沒有包含文字,所以要獨立出來找 | |
Map<String, NodeList> blockNodeMap = new HashMap<String, NodeList>(); | |
blockNodeMap.put(VISUAL_BLOCK, visualBlockNodeList); | |
blockNodeMap.put(LINK_BLOCK, linkNodeList); | |
blockNodeMap.put(INVALID_BLOCK, invalidNodeList); | |
blockNodeMap.put(ACTION_BLOCK, actionNodeList); | |
List<BlockProperties> blockPropertiesList = getBlockProperties(blockNodeMap); | |
Map<BlockProperties, Double> propMap = new TreeMap<BlockProperties, Double>(); | |
for (int i = 0; i < blockPropertiesList.size(); i++) { | |
BlockProperties blockProp = blockPropertiesList.get(i); | |
// /* | |
if (blockProp.getProperties().equals(NORMAL_BLOCK) && blockProp.getBlockText().length() <= 0) | |
continue; | |
// v1: 0.3 | |
else if ((blockProp.getProperties().equals(LINK_BLOCK) && blockProp.getBlockTextRatio() >= 0.4) || | |
blockProp.getSubLinkTextRatio() >= 0.45) | |
continue; | |
// v1: 0.4 | |
else if ((blockProp.getProperties().equals(INVALID_BLOCK) && blockProp.getBlockTextRatio() >= 0.65) || | |
blockProp.getSubInvalidTextRatio() >= 0.4) | |
continue; | |
else if (blockProp.getProperties().equals(ACTION_BLOCK) || blockProp.getSubActionBlock() > 0) | |
continue; | |
// */ | |
// blockProp.print(); | |
// 找出正文區塊的計算特徵值公式 | |
double linkInvalidTextLen = blockProp.getSubInvalidTextLength() + blockProp.getSubLinkTextLength(); | |
double normalTextLen = blockProp.getBlockText().length() * (1.0 - blockProp.getBlockTextRatio()) - linkInvalidTextLen; | |
if (linkInvalidTextLen <= 0) | |
linkInvalidTextLen = 1.0; // 為了除法,如果等於零要轉成1 | |
double linkInvalidSubBlockNum = blockProp.getSubLinkBlock() + blockProp.getSubInvalidBlock(); | |
double normalSubBlock = (double) (blockProp.getSubBlockNum() - linkInvalidSubBlockNum); | |
if (linkInvalidSubBlockNum <= 0) | |
linkInvalidSubBlockNum = 1.0; | |
double weight = Math.pow(normalTextLen, 5) / (double) blockProp.getBlockText().length(); | |
weight /= Math.pow(10.0, 5); | |
// 子區块都是連結或是無效區块 | |
if (normalSubBlock == 0 && blockProp.getSubBlockNum() != 0) | |
weight /= (10.0 * Math.pow(blockProp.getSubBlockNum(), 2)); | |
else if (normalSubBlock != 0 && blockProp.getSubBlockNum() != 0) | |
weight *= (normalSubBlock / Math.pow(blockProp.getSubBlockNum(), 2)); | |
if (blockProp.getProperties().equals(NORMAL_BLOCK)) | |
weight *= 3.0; | |
else if (blockProp.getProperties().equals(INVALID_BLOCK)) | |
weight *= 1.2; | |
else if (blockProp.getProperties().equals(LINK_BLOCK)) | |
weight *= 1.8; | |
// 由視覺區块的class或id來判斷,包含article和content的字眼可提高權重值 | |
CompositeTag blockTag = (CompositeTag) blockProp.getBlockNode(); | |
String className = blockTag.getAttribute("class"); | |
String idName = blockTag.getAttribute("id"); | |
String checkName = null; | |
if (idName != null) | |
checkName = idName.toLowerCase(); | |
else if (className != null) | |
checkName = className.toLowerCase(); | |
if (checkName != null) { | |
checkName = checkName.toLowerCase().trim(); | |
if (!(checkName.contains("footer") || | |
checkName.contains("header") || | |
checkName.contains("counter") || | |
checkName.contains("banner")) || | |
checkName.contains("widget")) { | |
// System.out.println( "*Weight=" + weight ); | |
if ((checkName.contains("body") && | |
checkName.contains("post")) || | |
(checkName.contains("entry") && | |
checkName.contains("content")) || | |
checkName.contains("innertext") || | |
(checkName.contains("content") && | |
checkName.contains("article"))) | |
weight *= 1000.0; | |
else if (checkName.contains("content")) | |
weight *= 50.0; | |
else if (checkName.contains("article")) | |
weight *= 10.0; | |
else if (checkName.contains("text")) | |
weight *= 5.0; | |
// 有id的再加分 | |
if (idName != null) | |
weight *= 100.0; | |
// System.out.println( "Weight'=" + weight ); | |
} | |
} | |
propMap.put(blockProp, weight); | |
// System.out.println( "\t*Weight=" + weight ); | |
propMap = MapUtils.sortByValue(propMap, true); | |
} | |
int count = 0; // 為了取得第一個BlockProperties用的 | |
BlockProperties contentProp = null; | |
int commentIndex = 0; // 用來儲存回應的區塊索引,所有在回應以下的區塊都不能成為正文區块 | |
for (BlockProperties prop : propMap.keySet()) { | |
if (prop.getBlockText().length() <= 0) | |
continue; | |
try { | |
// 用來去掉回應區塊 | |
CompositeTag propNode = (CompositeTag) prop.getBlockNode(); | |
String className = propNode.getAttribute("class"); | |
String idName = propNode.getAttribute("id"); | |
String checkName = null; | |
if (idName != null) | |
checkName = idName; | |
else if (className != null) | |
checkName = className; | |
if (checkName != null) { | |
if (checkName.contains("comment") || checkName.contains("reply")) { | |
commentIndex = blockPropertiesList.indexOf(prop); | |
continue; | |
} else if (containTrimClassID(propNode)) | |
continue; | |
} | |
} | |
catch (NullPointerException e) { | |
e.printStackTrace(); | |
} | |
if (commentIndex != 0 && blockPropertiesList.indexOf(prop) > commentIndex) | |
continue; | |
if (count == 0) { | |
// System.out.println( "\n\n--> Wegith= " + propMap.get( prop ) ); prop.print(); | |
contentProp = prop; | |
count++; | |
} else | |
break; | |
} | |
// 找到正文區塊還有其子區块,判斷是否包含連結區塊或是特定class, id名稱,然後過濾掉。 | |
Node currentNode = contentProp.getBlockNode(); | |
String contentHtml = | |
org.apache.commons.lang3.StringUtils.replacePattern(currentNode.toHtml(), "\\s+", " ").toLowerCase().trim(); | |
// currentNode.toHtml().replaceAll( "\\s+", " " ).toLowerCase().trim(); | |
String contentText = WebCrawler.filterSpecialSymbol( | |
org.apache.commons.lang3.StringUtils.replacePattern(currentNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim()); | |
// currentNode.toPlainTextString().replaceAll( "\\s+", " " ).trim() ); | |
// System.out.println( contentText.length() ); | |
String checkContentHtml = contentHtml; // check開頭的變數是給迴圈判斷正文區塊的子區塊用的 | |
String checkContentText = contentText; // 因為在迴圈中contentHtml和contentText的字串會變動,所以無法拿來判斷子區塊 | |
Map<String, Integer> trimTextMap = new TreeMap<String, Integer>(); | |
for (int i = visualBlockNodeList.indexOf(contentProp.getBlockNode()) + 1; | |
i < visualBlockNodeList.size(); i++ | |
) { | |
CompositeTag nextNode = (CompositeTag) visualBlockNodeList.elementAt(i); | |
String nextHtml = | |
org.apache.commons.lang3.StringUtils.replacePattern(nextNode.toHtml(), "\\s+", " ").toLowerCase().trim(); | |
// nextNode.toHtml().replaceAll( "\\s+", " " ).toLowerCase().trim(); | |
String nextText = WebCrawler.filterSpecialSymbol( | |
org.apache.commons.lang3.StringUtils.replacePattern(nextNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim()); | |
// nextNode.toPlainTextString().replaceAll( "\\s+", " " ).trim() ).trim(); | |
if (checkContentHtml.contains(nextHtml) && | |
checkContentText.contains(nextText)) { | |
// System.out.println( "*" + nextHtml ); | |
// System.out.println( "\t" + nextText ); | |
// System.out.println( nextText.length() ); | |
if (containTrimClassID(nextNode)) { | |
// 預防過濾掉整個正文字串 | |
if (nextText.length() < (double) contentText.length() * (2.0 / 3.0)) { | |
// System.out.printf("\t過濾掉:%s\n", nextText ); | |
trimTextMap.put(nextHtml, nextText.length()); | |
} | |
} | |
if (linkNodeList.contains(nextNode)) { | |
int linkTextLen = getLinkTextLength(nextNode); | |
double linkTextRatio = (double) linkTextLen / (double) nextText.length(); | |
if (linkTextRatio >= 0.65) { | |
if (nextText.length() < (double) contentText.length() * (2.0 / 3.0)) { | |
trimTextMap.put(nextHtml, nextText.length()); | |
// System.out.println( "\t連結區: " + linkTextRatio ); | |
} | |
} | |
} | |
} else | |
break; | |
} | |
/** | |
* 不在上面直接過濾而且是由字串長的過濾到短的是因為 | |
* 如果要過濾的字串是很短的話,過濾掉後會影響到長字串的過濾 | |
* ex: 「我今天去了淡水,還有去淡水老街。」 | |
* 先過濾「淡水」 ==> 「我今天去了,還有去老街。」 | |
* 再過濾「淡水老街」 ==> 無法過濾掉「淡水老街」,因為字串剩下「老街」 | |
* 過濾不完全!! | |
* 但如果先過濾掉「淡水老街」 ==> 「我今天去了淡水,還有去。」 | |
* 在過濾掉「淡水」 ==> 「我今天去了,還有去。」 | |
* 完全過濾!! | |
*/ | |
trimTextMap = MapUtils.sortByValue(trimTextMap, true); | |
for (String html : trimTextMap.keySet()) { | |
// System.out.println( "Filter=" + html ); | |
contentHtml = contentHtml.replace(html, ""); | |
} | |
contentProp.setBlockHtml( | |
org.apache.commons.lang3.StringUtils.replacePattern( | |
org.apache.commons.lang3.StringUtils.replacePattern(contentHtml, HtmlTag.BR_REPLACE_REGEX, " "), HtmlTag.STYLE_REPLACE_REGEX, "")); | |
return contentProp; | |
} |
Demo
Leave a Reply