หน้าเว็บ

วันอังคารที่ 30 กรกฎาคม พ.ศ. 2556

Extract document content with Apache Tika (Java)


Maven project
pom.xml: add the following dependencies
<!-- Apache Tika dependencies (begin). Keep both artifacts on the same version. -->
<!-- tika-core: the core library, including the org.apache.tika.Tika facade used by the example code. -->
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-core</artifactId>
    <version>1.4</version>
</dependency>
<!-- tika-parsers: the format-specific parser implementations (needed to parse real documents such as PDF). -->
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-parsers</artifactId>
    <version>1.4</version>
</dependency>
<!-- Apache Tika dependencies (end) -->
code
package com.blogspot.na5cent.learning.tika;

import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

/**
 * Minimal example of extracting plain text from a document with the Apache
 * Tika facade.
 *
 * <p>The sample PDF is loaded from the classpath, handed to
 * {@link Tika#parseToString(InputStream)}, and the extracted text is written
 * to the class logger.
 *
 * @author redcrow
 */
public class TikaTest {

    private static final Logger LOG = Logger.getLogger(TikaTest.class.getName());

    /** Absolute classpath location of the sample document to extract. */
    private static final String SAMPLE_RESOURCE = "/APress_ProJavaScriptDesignPatterns.pdf";

    /**
     * Extracts the text of the sample PDF and logs it.
     *
     * <p>Failures are logged rather than propagated: a missing resource is
     * reported at {@code SEVERE}, read/parse failures at {@code WARNING}.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        InputStream pdfInputStream = TikaTest.class.getResourceAsStream(SAMPLE_RESOURCE);
        if (pdfInputStream == null) {
            // getResourceAsStream returns null (it does not throw) when the
            // resource is absent; passing null on to Tika would only surface
            // later as an unexplained NullPointerException.
            LOG.log(Level.SEVERE, "Resource not found on classpath: {0}", SAMPLE_RESOURCE);
            return;
        }

        Tika tika = new Tika();
        try {
            String extractString = tika.parseToString(pdfInputStream);
            // The extracted text is the normal result of the program, not an error.
            LOG.log(Level.INFO, extractString);
        } catch (IOException ex) {
            LOG.log(Level.WARNING, "Could not read " + SAMPLE_RESOURCE, ex);
        } catch (TikaException ex) {
            LOG.log(Level.WARNING, "Could not parse " + SAMPLE_RESOURCE, ex);
        } finally {
            // Defensive close: harmless if the stream has already been closed
            // by the parse call.
            try {
                pdfInputStream.close();
            } catch (IOException ex) {
                LOG.log(Level.WARNING, "Could not close stream for " + SAMPLE_RESOURCE, ex);
            }
        }
    }
}

result


ไม่มีความคิดเห็น:

แสดงความคิดเห็น