Javaで各種ファイル形式の内容を読み込む

14877 ワード

必要なjarパッケージについてはよく覚えていないので、各自で探してください。以下にコードをそのまま掲載します:
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

/**
*         
*/
public class ReadFileConverter 
{

public String getContents(String path) throws Exception
{
  String contents = "";
  int index = path.lastIndexOf(".");
  String file_suffix = path.substring(index+1).toLowerCase();
  if(file_suffix.equalsIgnoreCase("txt")||file_suffix.equalsIgnoreCase("log")){
    contents = this.readTXT(path);
  }
  else if(file_suffix.equalsIgnoreCase("xls")){
    contents = this.readXLS(path);
  }
  else if(file_suffix.equalsIgnoreCase("xlsx")){
    contents = this.readXLSX(path);
  }
  else if(file_suffix.equalsIgnoreCase("doc")){
    contents = this.readDOC(path);
  }
  else if(file_suffix.equalsIgnoreCase("docx")){
    contents = this.readDOCX(path);
  }
  else if(file_suffix.equalsIgnoreCase("pdf")){
    contents = this.readPDF(path);
  }
  return contents;
}


public String readXLS(String file) throws Exception
{
  StringBuilder content = new StringBuilder();
  HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(file));
  try{
    for(int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++){
      if (null != workbook.getSheetAt(numSheets)){
        HSSFSheet aSheet = workbook.getSheetAt(numSheets);//     sheet
        for(int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++){
          if (null != aSheet.getRow(rowNumOfSheet)){
            HSSFRow aRow = aSheet.getRow(rowNumOfSheet); //      
            for(short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++){
              if (null != aRow.getCell(cellNumOfRow)){
                HSSFCell aCell = aRow.getCell(cellNumOfRow);//     
                if (this.convertCell(aCell).length() > 0){
                  content.append(this.convertCell(aCell));
                 }
              }
              content.append("
");             }           }         }       }     }   }   catch(Exception e){     content.append("xls ");   }   finally{     if(workbook!=null){       workbook.close();     }   }   return content.toString(); } public String readXLSX(String file) throws Exception {   StringBuilder content = new StringBuilder();   XSSFWorkbook workbook = new XSSFWorkbook(file);   try{     for(int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++){       if (null != workbook.getSheetAt(numSheets)){         XSSFSheet aSheet = workbook.getSheetAt(numSheets);// sheet         for(int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++){           if (null != aSheet.getRow(rowNumOfSheet)){             XSSFRow aRow = aSheet.getRow(rowNumOfSheet); //             for(short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++){               if (null != aRow.getCell(cellNumOfRow)){                 XSSFCell aCell = aRow.getCell(cellNumOfRow);//                 if (this.convertCell(aCell).length() > 0){                   content.append(this.convertCell(aCell));                 }               }               content.append("
");             }           }         }       }     }   }catch(Exception e){     content.append("xlsx ");   }   finally{     if(workbook!=null){       workbook.close();     }   }   return content.toString(); } public String readTXT(String file) throws Exception {   String contents = "";   try{     String encoding = this.get_charset(new File(file));     if (encoding.equalsIgnoreCase("GBK")) {       contents = FileUtils.readFileToString(new File(file), "gbk");     } else {       contents = FileUtils.readFileToString(new File(file), "utf8");     }   }catch(Exception e){     contents = "txt ";   }   return contents; } public String readDOC(String file) throws Exception {   String returnStr;   WordExtractor wordExtractor = new WordExtractor(new FileInputStream(new File(file)));   try{     returnStr = wordExtractor.getText();   }catch(Exception e){     returnStr="doc ";   }   finally{     if(wordExtractor != null){       wordExtractor.close();     }   }   return returnStr; } public String readDOCX(String file) throws Exception {   String docx;   XWPFWordExtractor xwp= new XWPFWordExtractor(POIXMLDocument.openPackage(file));   try{     docx= xwp.getText();   }catch(Exception e){     docx="docx ";   }   finally{     if(xwp !=null){       xwp.close();     }   }   return docx; } public String readPDF(String file) throws Exception {   String result = null;   FileInputStream is = null;   PDDocument document = null;   try{     is = new FileInputStream(file);     document = PDDocument.load(is);     PDFTextStripper stripper = new PDFTextStripper();     result = stripper.getText(document);   }catch(Exception e){     result="pdf ";   }   finally{     if (is != null){       is.close();     }     if (document != null){       document.close();    }   }   return result; } private String get_charset(File file) throws IOException {   String charset = "GBK";   byte[] first3Bytes = new byte[3];   BufferedInputStream bis = null;   try {     boolean checked = false;     bis = new 
BufferedInputStream(new FileInputStream(file));     bis.mark(0);     int read = bis.read(first3Bytes, 0, 3);     if (read == -1)       return charset;     if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {       charset = "UTF-16LE";       checked = true;     } else if (first3Bytes[0] == (byte) 0xFE&& first3Bytes[1] == (byte) 0xFF) {       charset = "UTF-16BE";       checked = true;     } else if (first3Bytes[0] == (byte) 0xEF&& first3Bytes[1] == (byte) 0xBB&& first3Bytes[2] == (byte) 0xBF) {       charset = "UTF-8";       checked = true;     }     bis.reset();     if (!checked) {       // int len = 0;       int loc = 0;       while ((read = bis.read()) != -1) {         loc=loc+1;         if (read >= 0xF0)           break;         if (0x80 <= read && read <= 0xBF) // BF , GBK           break;         if (0xC0 <= read && read <= 0xDF) {           read = bis.read();           if (0x80 <= read && read <= 0xBF) // (0xC0 - 0xDF)           // (0x80           // - 0xBF), GB             continue;           else             break;         } else if (0xE0 <= read && read <= 0xEF) {// ,           read = bis.read();           if (0x80 <= read && read <= 0xBF) {             read = bis.read();             if (0x80 <= read && read <= 0xBF) {               charset = "UTF-8";               break;             } else               break;           } else             break;         }       }     }   } catch (Exception e) {     e.printStackTrace();   } finally {     if (bis != null) {       bis.close();     }   }   return charset; } @SuppressWarnings("deprecation") private String convertCell(Cell cell) {   NumberFormat formater = NumberFormat.getInstance();   formater.setGroupingUsed(false);   String cellValue = "";   if (cell == null) {     return cellValue;   }   switch (cell.getCellTypeEnum()) {     case NUMERIC:       cellValue = formater.format(cell.getNumericCellValue());       break;     case STRING:       cellValue = cell.getStringCellValue();       break;    
 case BLANK:       cellValue = cell.getStringCellValue();       break;     case BOOLEAN:       cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();       break;     case ERROR:       cellValue = String.valueOf(cell.getErrorCellValue());       break;     default:       cellValue = "";     }     return cellValue.trim();   } }

 
転載元:https://www.cnblogs.com/zwdx/p/7234484.html