/**
* redcrow
* na5cent.blogspot.com
* 04/01/2013
*/
package com.blogspot.na5cent.jsflearning;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page once and extracts fragments of its markup with regular expressions.
 *
 * <p>The constructor reads the whole page into memory; {@link #get(String, int)} can then be
 * called any number of times, typically with one of the {@code *_PATTERN} constants declared
 * here. Line terminators of the downloaded page are dropped while reading, so the page is
 * matched as one long line.
 *
 * @author Redcrow
 */
public class HTMLParser {

    public static final String TITLE_PATTERN = "<title>(.*?)</title>";
    public static final String YOUTUBE_SCREEN_SHORT_PATTERN = "<link itemprop=\"thumbnailUrl\" href=\"(.*?)\">";
    public static final String IMAGE_PATTERN = "<img[^>]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>";
    public static final String WEBCONTENT_PATTERN = "<p.*?>(.*?)</p>";
    public static final String URL_PATTERN = "^(https?|ftp|file|//)(://)?[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";

    // Fixed expressions are compiled once instead of on every call/match.
    private static final Pattern ABSOLUTE_URL = Pattern.compile(URL_PATTERN);
    private static final Pattern YOUTUBE_THUMBNAIL = Pattern.compile(YOUTUBE_SCREEN_SHORT_PATTERN);
    private static final Pattern SCRIPT_ELEMENT = Pattern.compile("\\<script.*?\\>.*?\\</script\\>");
    private static final Pattern ANY_TAG = Pattern.compile("\\<.*?\\>");

    private final String html;
    private final String host;
    private final URL pageUrl;

    /**
     * Downloads the page at {@code urlText}, decoding it as UTF-8.
     *
     * @param urlText address of the page to download
     * @throws MalformedURLException if {@code urlText} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public HTMLParser(String urlText) throws MalformedURLException, IOException {
        URL url = new URL(urlText);
        StringBuilder builder = new StringBuilder();
        InputStream inputStream = url.openStream();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the terminator and nothing is put back in its place
                builder.append(line);
            }
        } finally {
            // always release the connection / file handle, even when reading fails
            inputStream.close();
        }
        html = builder.toString();
        host = url.getHost();
        pageUrl = url;
    }

    /**
     * Returns the first capturing group of up to {@code numberOfResult} matches of
     * {@code pattern} in the downloaded page, with markup stripped from every value.
     *
     * <p>When {@code pattern} is {@link #IMAGE_PATTERN}, image addresses that do not match
     * {@link #URL_PATTERN} are resolved against the page URL. In addition, if the page host is
     * {@code www.youtube.com}, every {@link #YOUTUBE_SCREEN_SHORT_PATTERN} match is inserted
     * at the front of the list; these extra entries are not counted against
     * {@code numberOfResult}.
     *
     * @param pattern regular expression with at least one capturing group
     * @param numberOfResult maximum number of {@code pattern} matches to collect; values of
     *     zero or less collect none
     * @return a new mutable list of extracted values, possibly empty
     */
    public List<String> get(String pattern, int numberOfResult) {
        List<String> result = new ArrayList<String>();
        boolean imageSearch = IMAGE_PATTERN.equals(pattern);
        Matcher matcher = Pattern.compile(pattern).matcher(html);
        int count = 0;
        // test the limit first so no search is wasted once enough results are collected
        while (count < numberOfResult && matcher.find()) {
            count++;
            String text = matcher.group(1);
            if (imageSearch && !ABSOLUTE_URL.matcher(text).matches()) {
                text = toAbsoluteUrl(text);
            }
            result.add(stripMarkup(text));
        }
        if (imageSearch && "www.youtube.com".equals(host)) {
            Matcher thumbnails = YOUTUBE_THUMBNAIL.matcher(html);
            while (thumbnails.find()) {
                result.add(0, thumbnails.group(1));
            }
        }
        return result;
    }

    /**
     * Resolves a relative reference against the page URL, so that paths without a leading
     * slash and pages served from a non-default port produce a usable address.
     */
    private String toAbsoluteUrl(String reference) {
        try {
            return new URL(pageUrl, reference).toString();
        } catch (MalformedURLException e) {
            // cannot be resolved (for example a scheme without a registered handler):
            // hand back the raw attribute value rather than inventing an address
            return reference;
        }
    }

    /** Removes script elements, replaces every remaining tag with a space and drops stray angle brackets. */
    private static String stripMarkup(String text) {
        String withoutScripts = SCRIPT_ELEMENT.matcher(text).replaceAll("");
        String withoutTags = ANY_TAG.matcher(withoutScripts).replaceAll(" ");
        return withoutTags.replace("<", "").replace(">", "");
    }
}
/*
Example usage:
try {
HTMLParser parser = new HTMLParser(url);
title = parser.get(HTMLParser.TITLE_PATTERN, 1);
image = parser.get(HTMLParser.IMAGE_PATTERN, 10);
content = parser.get(HTMLParser.WEBCONTENT_PATTERN, 2);
} catch (MalformedURLException ex) {
Logger.getLogger(HtmlPerserMB.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException ex) {
Logger.getLogger(HtmlPerserMB.class.getName()).log(Level.SEVERE, null, ex);
}
*/
/*
 * Blog page residue (translated from Thai), not part of the program:
 * Thursday, 3 January B.E. 2556 (2013 CE)
 * extract web content html : java
 * Subscribe to: Post Comments (Atom)
 * No comments:
 * Post a Comment
 */