網頁正文提取演算法

在 Styletrip 專案內,我們會抓所有景點資料的相關部落格,然後利用部落格的內容做進一步的分析,所以我們需要有能力把抓到的部落格網頁的正文內容提取出來。

擷取選取區域_004
紅色框起來的地方就是網頁正文

網頁正文,就是我們平常所看部落格文章最主要的內容區塊,而網頁正文提取的演算法有基於不同的方法來做提取,各方法的準確度都不同,也會因為網頁結構不同而有不一樣的結果。

這邊我是採用 VIPS (Vision-based Page Segmentation) 演算法,這個演算法是基於視覺結構來做判斷,也就是可以想成:網頁正文通常都是那個網頁裡面看起來最大的區域。

這邊提供最核心、主要的實作方式:

/**
 * Picks the visual block most likely to hold the page's main content and
 * strips navigation-like sub-blocks from its HTML.
 *
 * <p>The method works in three phases:
 * <ol>
 *   <li>Score every candidate block returned by {@code getBlockProperties}: blocks that
 *       are mostly link/invalid/action content are discarded, the rest receive a weight
 *       derived from their text length, sub-block composition, block kind and
 *       id/class name.</li>
 *   <li>Walk the candidates in the order produced by {@code MapUtils.sortByValue} and take
 *       the first one that has text, is not a comment/reply block, is not rejected by
 *       {@code containTrimClassID} and does not come after a detected comment block.</li>
 *   <li>Scan the visual blocks that follow the chosen block and are contained in it, collect
 *       those that should be trimmed, remove their HTML from the chosen block's HTML and
 *       store the result with {@code setBlockHtml}.</li>
 * </ol>
 *
 * @param parser parser positioned on the page to analyse; passed to {@code getVisualBlock}
 * @return the chosen block's {@code BlockProperties}, with its block HTML replaced by the
 *         cleaned, lower-cased, whitespace-normalised HTML
 * @throws ParserException if no candidate block survives the filtering; it may also be
 *         thrown by the helper methods this method calls (not visible here)
 */
public static BlockProperties parseWebContent(Parser parser) throws ParserException {
    NodeList visualBlockNodeList = getVisualBlock(parser);
    NodeList linkNodeList = findLinkBlock(visualBlockNodeList);
    NodeList invalidNodeList = findInvalidBlock(visualBlockNodeList);
    // Action tags may contain no text at all, so they are looked up separately.
    NodeList actionNodeList = findActionBlock(visualBlockNodeList);

    Map<String, NodeList> blockNodeMap = new HashMap<String, NodeList>();
    blockNodeMap.put(VISUAL_BLOCK, visualBlockNodeList);
    blockNodeMap.put(LINK_BLOCK, linkNodeList);
    blockNodeMap.put(INVALID_BLOCK, invalidNodeList);
    blockNodeMap.put(ACTION_BLOCK, actionNodeList);
    List<BlockProperties> blockPropertiesList = getBlockProperties(blockNodeMap);

    // ---- Phase 1: weight every candidate block ---------------------------------------
    Map<BlockProperties, Double> propMap = new TreeMap<BlockProperties, Double>();
    for (int i = 0; i < blockPropertiesList.size(); i++) {
        BlockProperties blockProp = blockPropertiesList.get(i);

        // Discard blocks that cannot be main content.
        if (blockProp.getProperties().equals(NORMAL_BLOCK) && blockProp.getBlockText().length() <= 0) {
            continue;
        }
        // Link-heavy blocks (the first threshold was 0.3 in v1).
        if ((blockProp.getProperties().equals(LINK_BLOCK) && blockProp.getBlockTextRatio() >= 0.4)
                || blockProp.getSubLinkTextRatio() >= 0.45) {
            continue;
        }
        // Invalid-heavy blocks (the first threshold was 0.4 in v1).
        if ((blockProp.getProperties().equals(INVALID_BLOCK) && blockProp.getBlockTextRatio() >= 0.65)
                || blockProp.getSubInvalidTextRatio() >= 0.4) {
            continue;
        }
        if (blockProp.getProperties().equals(ACTION_BLOCK) || blockProp.getSubActionBlock() > 0) {
            continue;
        }

        // Feature formula used to find the main-content block.
        double linkInvalidTextLen = blockProp.getSubInvalidTextLength() + blockProp.getSubLinkTextLength();
        double normalTextLen =
                blockProp.getBlockText().length() * (1.0 - blockProp.getBlockTextRatio()) - linkInvalidTextLen;
        double linkInvalidSubBlockNum = blockProp.getSubLinkBlock() + blockProp.getSubInvalidBlock();
        double normalSubBlock = (double) (blockProp.getSubBlockNum() - linkInvalidSubBlockNum);

        double weight = Math.pow(normalTextLen, 5) / (double) blockProp.getBlockText().length();
        weight /= Math.pow(10.0, 5);

        if (normalSubBlock == 0 && blockProp.getSubBlockNum() != 0) {
            // Every sub-block is a link or invalid block: penalise heavily.
            weight /= (10.0 * Math.pow(blockProp.getSubBlockNum(), 2));
        } else if (normalSubBlock != 0 && blockProp.getSubBlockNum() != 0) {
            weight *= (normalSubBlock / Math.pow(blockProp.getSubBlockNum(), 2));
        }

        if (blockProp.getProperties().equals(NORMAL_BLOCK)) {
            weight *= 3.0;
        } else if (blockProp.getProperties().equals(INVALID_BLOCK)) {
            weight *= 1.2;
        } else if (blockProp.getProperties().equals(LINK_BLOCK)) {
            weight *= 1.8;
        }

        // Use the block's id (preferred) or class: names that mention article/content
        // and similar words raise the weight.
        CompositeTag blockTag = (CompositeTag) blockProp.getBlockNode();
        String className = blockTag.getAttribute("class");
        String idName = blockTag.getAttribute("id");
        String checkName = null;
        if (idName != null) {
            checkName = idName.toLowerCase();
        } else if (className != null) {
            checkName = className.toLowerCase();
        }
        if (checkName != null) {
            checkName = checkName.trim();
            // Boilerplate containers never get a bonus. "widget" belongs inside this
            // negated group; it used to sit outside the parentheses, which made every
            // widget block eligible for the bonus.
            boolean boilerplateName = checkName.contains("footer")
                    || checkName.contains("header")
                    || checkName.contains("counter")
                    || checkName.contains("banner")
                    || checkName.contains("widget");
            if (!boilerplateName) {
                if ((checkName.contains("body") && checkName.contains("post"))
                        || (checkName.contains("entry") && checkName.contains("content"))
                        || checkName.contains("innertext")
                        || (checkName.contains("content") && checkName.contains("article"))) {
                    weight *= 1000.0;
                } else if (checkName.contains("content")) {
                    weight *= 50.0;
                } else if (checkName.contains("article")) {
                    weight *= 10.0;
                } else if (checkName.contains("text")) {
                    weight *= 5.0;
                }
                // Blocks that carry an id get an extra bonus.
                if (idName != null) {
                    weight *= 100.0;
                }
            }
        }

        propMap.put(blockProp, weight);
        // NOTE(review): re-sorting on every iteration is kept as-is because the map type and
        // tie ordering returned by MapUtils.sortByValue are not visible here; presumably the
        // 'true' flag means descending order. Confirm before hoisting this out of the loop.
        propMap = MapUtils.sortByValue(propMap, true);
    }

    // ---- Phase 2: pick the first acceptable candidate --------------------------------
    BlockProperties contentProp = null;
    // Index (in blockPropertiesList) of a comment/reply block; blocks located after it
    // cannot be the main content. 0 means "none seen yet".
    int commentIndex = 0;
    for (BlockProperties prop : propMap.keySet()) {
        if (prop.getBlockText().length() <= 0) {
            continue;
        }
        try {
            // Skip comment/reply blocks and remember where they are.
            CompositeTag propNode = (CompositeTag) prop.getBlockNode();
            String className = propNode.getAttribute("class");
            String idName = propNode.getAttribute("id");
            String checkName = null;
            // Lower-cased so that e.g. "Comments" is recognised, consistent with the
            // name matching used while weighting.
            if (idName != null) {
                checkName = idName.toLowerCase();
            } else if (className != null) {
                checkName = className.toLowerCase();
            }
            if (checkName != null) {
                if (checkName.contains("comment") || checkName.contains("reply")) {
                    commentIndex = blockPropertiesList.indexOf(prop);
                    continue;
                } else if (containTrimClassID(propNode)) {
                    continue;
                }
            }
        } catch (NullPointerException e) {
            // Best effort: a block whose node cannot be inspected is still considered
            // as a candidate below.
            e.printStackTrace();
        }
        if (commentIndex != 0 && blockPropertiesList.indexOf(prop) > commentIndex) {
            continue;
        }
        contentProp = prop;
        break;
    }

    if (contentProp == null) {
        // Fail with the declared exception type instead of a bare NullPointerException.
        throw new ParserException("No main-content block candidate found");
    }

    // ---- Phase 3: trim link blocks / blacklisted sub-blocks from the chosen block ----
    Node currentNode = contentProp.getBlockNode();
    String contentHtml = org.apache.commons.lang3.StringUtils
            .replacePattern(currentNode.toHtml(), "\\s+", " ").toLowerCase().trim();
    String contentText = WebCrawler.filterSpecialSymbol(
            org.apache.commons.lang3.StringUtils
                    .replacePattern(currentNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim());
    // A sub-block is trimmed only if it is shorter than two thirds of the whole text,
    // so that the entire main content is never filtered away.
    double maxTrimLength = (double) contentText.length() * (2.0 / 3.0);

    // Maps the HTML of each sub-block to remove to the length of its text.
    Map<String, Integer> trimTextMap = new TreeMap<String, Integer>();
    for (int i = visualBlockNodeList.indexOf(contentProp.getBlockNode()) + 1;
            i < visualBlockNodeList.size(); i++) {
        CompositeTag nextNode = (CompositeTag) visualBlockNodeList.elementAt(i);
        String nextHtml = org.apache.commons.lang3.StringUtils
                .replacePattern(nextNode.toHtml(), "\\s+", " ").toLowerCase().trim();
        String nextText = WebCrawler.filterSpecialSymbol(
                org.apache.commons.lang3.StringUtils
                        .replacePattern(nextNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim());

        // Stop at the first following block that is not contained in the chosen block.
        if (!(contentHtml.contains(nextHtml) && contentText.contains(nextText))) {
            break;
        }

        if (containTrimClassID(nextNode) && nextText.length() < maxTrimLength) {
            trimTextMap.put(nextHtml, nextText.length());
        }
        if (linkNodeList.contains(nextNode)) {
            int linkTextLen = getLinkTextLength(nextNode);
            double linkTextRatio = (double) linkTextLen / (double) nextText.length();
            if (linkTextRatio >= 0.65 && nextText.length() < maxTrimLength) {
                trimTextMap.put(nextHtml, nextText.length());
            }
        }
    }

    /*
     * The removal is done here, ordered by MapUtils.sortByValue (intended: longest first),
     * rather than directly in the loop above, because removing a short string first can
     * prevent a longer string from being removed.
     * Example text: "I went to Tamsui today, and also to Tamsui Old Street."
     *   Removing "Tamsui" first leaves "I went to  today, and also to  Old Street.";
     *   "Tamsui Old Street" can then no longer be found, so the filtering is incomplete.
     *   Removing "Tamsui Old Street" first and "Tamsui" second removes both completely.
     */
    trimTextMap = MapUtils.sortByValue(trimTextMap, true);
    for (String html : trimTextMap.keySet()) {
        contentHtml = contentHtml.replace(html, "");
    }

    String withoutBreaks = org.apache.commons.lang3.StringUtils
            .replacePattern(contentHtml, HtmlTag.BR_REPLACE_REGEX, " ");
    contentProp.setBlockHtml(
            org.apache.commons.lang3.StringUtils
                    .replacePattern(withoutBreaks, HtmlTag.STYLE_REPLACE_REGEX, ""));
    return contentProp;
}

Demo

擷取選取區域_00432343

 

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

Blog at WordPress.com.

Up ↑