FileReadUtil.java

package com.cesgroup.bdc.util;

import cn.hutool.core.io.FileTypeUtil;
import cn.hutool.core.io.IoUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.poi.excel.ExcelReader;
import cn.hutool.poi.excel.ExcelUtil;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;

/**
 * Static helpers that extract plain text from pdf, Word (doc/docx), Excel (xls/xlsx)
 * and txt files.
 */
public class FileReadUtil {

  /** The byte order mark as it appears in decoded text (the JDK decoders used here keep it). */
  private static final char BOM = '\uFEFF';

  /**
   * Reads the text content of a file, choosing the reader from the file type.
   *
   * <p>The type is the extension of the file <em>name</em> (not of the whole path, so a dot
   * in a directory name is never mistaken for an extension). When the name has no usable
   * extension the type reported by {@code FileTypeUtil.getType(file)} is used instead.
   *
   * @param file the file to read
   * @return the text content of the file
   * @throws RuntimeException if the file type is not one of pdf, doc, docx, xls, xlsx, txt
   * @throws Exception if the underlying reader fails
   */
  public static String readText(File file) throws Exception {
    String fileType = extensionOf(file);
    if (fileType.isEmpty()) {
      fileType = FileTypeUtil.getType(file);
    }
    if ("pdf".equalsIgnoreCase(fileType)) {
      return pdf2String(file);
    }
    // Dispatch doc/docx on the detected type rather than through word2String, which insists
    // on a ".doc"/".docx" suffix and would reject a file whose type came from the fallback.
    if ("doc".equalsIgnoreCase(fileType)) {
      return doc2String(file);
    }
    if ("docx".equalsIgnoreCase(fileType)) {
      return docx2String(file);
    }
    if ("xls".equalsIgnoreCase(fileType) || "xlsx".equalsIgnoreCase(fileType)) {
      return excel2String(file);
    }
    if ("txt".equalsIgnoreCase(fileType)) {
      return txt2String(file);
    }
    throw new RuntimeException("不支持的文件类型：" + fileType);
  }

  /**
   * Returns the text after the last dot of the file name, or an empty string when the name
   * contains no dot or ends with one.
   */
  private static String extensionOf(File file) {
    String name = file.getName();
    int dot = name.lastIndexOf('.');
    if (dot >= 0 && dot < name.length() - 1) {
      return name.substring(dot + 1);
    }
    return "";
  }

  /** Tells whether the file path ends with the given suffix, ignoring case. */
  private static boolean hasSuffix(File file, String suffix) {
    String path = file.getPath();
    int offset = path.length() - suffix.length();
    return offset >= 0 && path.regionMatches(true, offset, suffix, 0, suffix.length());
  }

  /**
   * Reads the content of a txt file.
   *
   * <p>The charset is guessed by {@link #getFilecharset(File)}. Lines are joined with
   * {@code "\n"}; a trailing line terminator is not reproduced. A leading byte order mark
   * is removed from the result.
   *
   * @param file the txt file to read
   * @return the file content
   * @throws IOException if the file cannot be opened or read
   * @author: shen.shaohua
   * @since: 2019/7/17 14:26
   */
  public static String txt2String(File file) throws IOException {
    StringBuilder text = new StringBuilder();
    // Open the stream first so a missing file surfaces as FileNotFoundException;
    // try-with-resources closes both resources even when the reader cannot be created.
    try (InputStream is = new FileInputStream(file);
         BufferedReader br = new BufferedReader(new InputStreamReader(is, getFilecharset(file)))) {
      boolean firstLine = true;
      String line;
      while ((line = br.readLine()) != null) {
        if (firstLine) {
          // The decoder hands the BOM through as U+FEFF; it is not part of the text.
          if (!line.isEmpty() && line.charAt(0) == BOM) {
            line = line.substring(1);
          }
          firstLine = false;
        } else {
          text.append("\n");
        }
        text.append(line);
      }
    }
    return text.toString();
  }

  /**
   * Guesses the charset of a text file.
   *
   * <p>A UTF-16LE, UTF-16BE or UTF-8 byte order mark decides the charset directly. Without
   * one, the bytes are scanned until the first sequence that settles the question: a
   * well-formed three-byte UTF-8 sequence yields {@code "UTF-8"}; anything else (including
   * an empty file, pure ASCII, or a read error) yields the default {@code "GBK"}.
   * Well-formed two-byte sequences are skipped because they are also valid GBK.
   *
   * @param sourceFile the file to inspect
   * @return the guessed charset name
   */
  private static String getFilecharset(File sourceFile) {
    String charset = "GBK";
    byte[] first3Bytes = new byte[3];
    try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
      // Reserve enough read-ahead for the BOM probe so reset() is guaranteed to work.
      bis.mark(first3Bytes.length);
      int read = bis.read(first3Bytes, 0, 3);
      if (read == -1) {
        return charset; // empty file: keep the default (ANSI)
      }
      if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
        return "UTF-16LE"; // Unicode, little endian
      }
      if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
        return "UTF-16BE"; // Unicode, big endian
      }
      if (first3Bytes[0] == (byte) 0xEF
        && first3Bytes[1] == (byte) 0xBB
        && first3Bytes[2] == (byte) 0xBF) {
        return "UTF-8";
      }
      // No BOM: rescan from the first byte.
      bis.reset();
      while ((read = bis.read()) != -1) {
        if (read >= 0xF0) {
          break;
        }
        if (isContinuationByte(read)) {
          break; // a lone byte in 0x80-0xBF is treated as GBK
        }
        if (0xC0 <= read && read <= 0xDF) {
          // Two-byte form (0xC0-0xDF)(0x80-0xBF) may equally be GBK: keep looking.
          if (isContinuationByte(bis.read())) {
            continue;
          }
          break;
        } else if (0xE0 <= read && read <= 0xEF) {
          // Three-byte form; a false positive is possible but unlikely.
          if (isContinuationByte(bis.read()) && isContinuationByte(bis.read())) {
            charset = "UTF-8";
          }
          break;
        }
      }
    } catch (Exception e) {
      // Best effort: report the problem and fall back to the default charset.
      e.printStackTrace();
    }
    return charset;
  }

  /** Tells whether the value lies in 0x80-0xBF; -1 (end of stream) does not. */
  private static boolean isContinuationByte(int value) {
    return 0x80 <= value && value <= 0xBF;
  }

  /**
   * Reads the text content of a pdf file.
   *
   * @param file the pdf file
   * @return the extracted text
   * @throws IOException if the document cannot be loaded or read
   * @author: shen.shaohua
   * @since: 2019/7/17 13:35
   */
  public static String pdf2String(File file) throws IOException {
    // try-with-resources closes the document even when text extraction fails.
    try (PDDocument document = PDDocument.load(file)) {
      PDFTextStripper stripper = new PDFTextStripper();
      return stripper.getText(document);
    }
  }

  /**
   * Reads the text content of a Word file, selected by its ".doc" or ".docx" suffix
   * (case-insensitive).
   *
   * @param file the Word file
   * @return the extracted text
   * @throws RuntimeException if the path ends with neither ".doc" nor ".docx"
   * @throws Exception if the file cannot be opened or parsed
   * @author: shen.shaohua
   * @since: 2019/7/17 14:00
   */
  public static String word2String(File file) throws Exception {
    if (hasSuffix(file, ".doc")) {
      return doc2String(file);
    }
    if (hasSuffix(file, ".docx")) {
      return docx2String(file);
    }
    throw new RuntimeException("此文件不是word文件！");
  }

  /** Extracts the text of a binary (.doc) Word file, closing the stream and the extractor. */
  private static String doc2String(File file) throws IOException {
    try (InputStream is = new FileInputStream(file);
         POITextExtractor extractor = new WordExtractor(is)) {
      return extractor.getText();
    }
  }

  /** Extracts the text of an OOXML (.docx) Word file, releasing the package in every case. */
  private static String docx2String(File file) throws Exception {
    OPCPackage opcPackage = OPCPackage.open(file);
    boolean handedOver = false;
    try {
      // NOTE(review): relies on the extractor's close() releasing the package it wraps,
      // as the original code did - confirm for the POI version in use.
      try (POITextExtractor extractor = new XWPFWordExtractor(opcPackage)) {
        handedOver = true;
        return extractor.getText();
      }
    } finally {
      if (!handedOver) {
        // The extractor was never created, so nothing else will release the package;
        // revert() drops it without writing anything back to the file.
        opcPackage.revert();
      }
    }
  }

  /**
   * Reads the text content of an Excel file.
   *
   * <p>Sheets are read in index order. Within a row, non-blank cells are joined with two
   * spaces; rows are joined with {@code "\n"} and every sheet is followed by {@code "\n"}.
   * Reading stops at the first sheet index that cannot be opened or read, so a failure
   * yields the text collected so far (possibly empty) instead of an exception.
   *
   * @param file the Excel file
   * @return the extracted text
   * @author: shen.shaohua
   * @since: 2019/10/25 15:00
   */
  public static String excel2String(File file) {
    StringBuilder sb = new StringBuilder();
    for (int sheetIndex = 0; ; sheetIndex++) {
      ExcelReader excelReader = null;
      try {
        excelReader = ExcelUtil.getReader(file, sheetIndex);
        if (excelReader == null) {
          break;
        }
        appendSheet(sb, excelReader.read());
      } catch (Exception e) {
        // An exception is how the end of the sheets is detected.
        // NOTE(review): this also hides genuine read errors - kept as best effort.
        break;
      } finally {
        // Each reader was opened for this sheet only; release it before moving on.
        IoUtil.close(excelReader);
      }
    }
    return sb.toString();
  }

  /** Appends the rows of one sheet to the buffer, followed by a line break. */
  private static void appendSheet(StringBuilder sb, List<List<Object>> rowList) {
    int rowIndex = 0;
    for (List<Object> cellList : rowList) {
      if (rowIndex > 0) {
        sb.append("\n");
      }
      int written = 0;
      for (Object cell : cellList) {
        if (cell == null || StrUtil.isBlank(cell.toString())) {
          continue;
        }
        if (written > 0) {
          sb.append("  ");
        }
        sb.append(cell);
        written++;
      }
      rowIndex++;
    }
    sb.append("\n");
  }

  /**
   * Manual smoke test: prints the text of the file named by the first argument, or of the
   * built-in sample path when no argument is given.
   *
   * @param args optional: the path of the file to read
   * @throws Exception if the file cannot be read
   */
  public static void main(String[] args) throws Exception {
    String path = args.length > 0 ? args[0] : "F:\\小说\\宾克的魔法.txt";
    String text = FileReadUtil.readText(new File(path));
    System.out.println("@@@");
    System.out.println(text);
    System.out.println("@@@");
  }
}