/**
 * redcrow
 * na5cent.blogspot.com
 * 04/01/2013
 */
package com.blogspot.na5cent.jsflearning;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads one HTML page and extracts fragments of it with regular expressions.
 *
 * <p>The page is fetched once, in the constructor, and kept in memory as a single
 * line of text; {@link #get(String, int)} can then be called any number of times
 * with one of the {@code *_PATTERN} constants or with a caller-supplied regex.
 *
 * @author Redcrow
 */
public class HTMLParser {

    /** Captures the text of the page's {@code <title>} element. */
    public static final String TITLE_PATTERN = "<title>(.*?)</title>";
    /** Captures the {@code href} of a {@code <link itemprop="thumbnailUrl">} element. */
    public static final String YOUTUBE_SCREEN_SHORT_PATTERN = "<link itemprop=\"thumbnailUrl\" href=\"(.*?)\">";
    /** Captures the {@code src} attribute value of an {@code <img>} element. */
    public static final String IMAGE_PATTERN = "<img[^>]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>";
    /** Captures the inner markup of a {@code <p>} element. */
    public static final String WEBCONTENT_PATTERN = "<p.*?>(.*?)</p>";
    /** Matches image sources that are left untouched (http, https, ftp, file or protocol-relative). */
    public static final String URL_PATTERN = "^(https?|ftp|file|//)(://)?[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";

    // Constant regexes are compiled once instead of on every call.
    private static final Pattern ABSOLUTE_URL = Pattern.compile(URL_PATTERN);
    private static final Pattern YOUTUBE_THUMBNAIL = Pattern.compile(YOUTUBE_SCREEN_SHORT_PATTERN);
    private static final Pattern SCRIPT_ELEMENT = Pattern.compile("\\<script.*?\\>.*?\\</script\\>");
    private static final Pattern ANY_TAG = Pattern.compile("\\<.*?\\>");

    private final String html;
    private final String host;
    /** Address of the downloaded page; relative image sources are resolved against it. */
    private final URL pageUrl;

    /**
     * Downloads the page at {@code urlText}, decoding it as UTF-8.
     *
     * <p>Source lines are joined with a single space, so the whole document is one
     * line (the patterns use {@code .}, which does not match line terminators)
     * while words and attributes that were separated only by a line break stay
     * separated.
     *
     * @param urlText absolute URL of the page to download
     * @throws MalformedURLException if {@code urlText} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public HTMLParser(String urlText) throws MalformedURLException, IOException {
        URL url = new URL(urlText);
        StringBuilder builder = new StringBuilder();
        InputStream inputStream = url.openStream();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                if (builder.length() > 0) {
                    builder.append(' ');
                }
                builder.append(line);
            }
        } finally {
            // Always release the connection, also when reading fails half-way.
            inputStream.close();
        }
        html = builder.toString();
        host = url.getHost();
        pageUrl = url;
    }

    /**
     * Returns the first capture group of up to {@code numberOfResult} matches of
     * {@code pattern} in the downloaded page, with any markup stripped.
     *
     * <p>When {@code pattern} is {@link #IMAGE_PATTERN}, sources that do not match
     * {@link #URL_PATTERN} are resolved against the page address. For pages hosted
     * on {@code www.youtube.com} every thumbnail link found in the page is
     * additionally inserted at the front of the list; these extra entries are not
     * counted against {@code numberOfResult}.
     *
     * @param pattern regular expression with at least one capture group
     * @param numberOfResult maximum number of matches to collect; values of zero
     *        or less collect none
     * @return the extracted values in document order, never {@code null}
     * @throws java.util.regex.PatternSyntaxException if {@code pattern} is not a valid regex
     * @throws IndexOutOfBoundsException if {@code pattern} has no capture group and matches
     */
    public List<String> get(String pattern, int numberOfResult) {
        List<String> result = new ArrayList<String>();
        boolean imageSearch = IMAGE_PATTERN.equals(pattern);
        Matcher matcher = Pattern.compile(pattern).matcher(html);

        // Check the limit first so no match is searched for once it is reached.
        for (int count = 0; count < numberOfResult && matcher.find(); count++) {
            String text = matcher.group(1);
            if (text == null) {
                // The group did not take part in this match (possible with custom patterns).
                continue;
            }
            if (imageSearch) {
                text = resolveImageUrl(text);
            }
            result.add(stripMarkup(text));
        }

        if (imageSearch && "www.youtube.com".equals(host)) {
            Matcher thumbnails = YOUTUBE_THUMBNAIL.matcher(html);
            while (thumbnails.find()) {
                result.add(0, thumbnails.group(1));
            }
        }
        return result;
    }

    /** Turns a relative image source into an absolute URL based on the page address. */
    private String resolveImageUrl(String src) {
        if (ABSOLUTE_URL.matcher(src).matches()) {
            return src;
        }
        try {
            // Handles "/abs", "rel/path" and "../up" alike and keeps a non-default port.
            return new URL(pageUrl, src).toString();
        } catch (MalformedURLException e) {
            // Sources with a scheme java.net.URL does not know (for example "data:")
            // cannot be resolved; hand them back unchanged rather than corrupting them.
            return src;
        }
    }

    /** Removes script elements, replaces remaining tags by a space and drops stray angle brackets. */
    private static String stripMarkup(String text) {
        String withoutScripts = SCRIPT_ELEMENT.matcher(text).replaceAll("");
        String withoutTags = ANY_TAG.matcher(withoutScripts).replaceAll(" ");
        return withoutTags.replace("<", "").replace(">", "");
    }
}

/* Example usage:

 try {
     HTMLParser parser = new HTMLParser(url);
     title = parser.get(HTMLParser.TITLE_PATTERN, 1);
     image = parser.get(HTMLParser.IMAGE_PATTERN, 10);
     content = parser.get(HTMLParser.WEBCONTENT_PATTERN, 2);
 } catch (MalformedURLException ex) {
     Logger.getLogger(HtmlPerserMB.class.getName()).log(Level.SEVERE, null, ex);
 } catch (IOException ex) {
     Logger.getLogger(HtmlPerserMB.class.getName()).log(Level.SEVERE, null, ex);
 }
 */
วันพฤหัสบดีที่ 3 มกราคม พ.ศ. 2556
extract web content html : java
สมัครสมาชิก:
ส่งความคิดเห็น (Atom)
ไม่มีความคิดเห็น:
แสดงความคิดเห็น