Javaで各種ファイル形式の内容を読み込む

14877 ワード
必要なjarパッケージはよく覚えていないので、各自で探してください。コードをそのまま掲載します:
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.NumberFormat;

import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

/**
*         
*/
public class ReadFileConverter 
{

public String getContents(String path) throws Exception
{
　　String contents = "";
　　int index = path.lastIndexOf(".");
　　String file_suffix = path.substring(index+1).toLowerCase();
　　if(file_suffix.equalsIgnoreCase("txt")||file_suffix.equalsIgnoreCase("log")){
　　　　contents = this.readTXT(path);
　　}
　　else if(file_suffix.equalsIgnoreCase("xls")){
　　　　contents = this.readXLS(path);
　　}
　　else if(file_suffix.equalsIgnoreCase("xlsx")){
　　　　contents = this.readXLSX(path);
　　}
　　else if(file_suffix.equalsIgnoreCase("doc")){
　　　　contents = this.readDOC(path);
　　}
　　else if(file_suffix.equalsIgnoreCase("docx")){
　　　　contents = this.readDOCX(path);
　　}
　　else if(file_suffix.equalsIgnoreCase("pdf")){
　　　　contents = this.readPDF(path);
　　}
　　return contents;
}


public String readXLS(String file) throws Exception
{
　　StringBuilder content = new StringBuilder();
　　HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(file));
　　try{
　　　　for(int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++){
　　　　　　if (null != workbook.getSheetAt(numSheets)){
　　　　　　　　HSSFSheet aSheet = workbook.getSheetAt(numSheets);//     sheet
　　　　　　　　for(int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++){
　　　　　　　　　　if (null != aSheet.getRow(rowNumOfSheet)){
　　　　　　　　　　　　HSSFRow aRow = aSheet.getRow(rowNumOfSheet); //      
　　　　　　　　　　　　for(short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++){
　　　　　　　　　　　　　　if (null != aRow.getCell(cellNumOfRow)){
　　　　　　　　　　　　　　　　HSSFCell aCell = aRow.getCell(cellNumOfRow);//     
　　　　　　　　　　　　　　　　if (this.convertCell(aCell).length() > 0){
　　　　　　　　　　　　　　　　　　content.append(this.convertCell(aCell));
　　　　　　　　　　　　　　　　　}
　　　　　　　　　　　　　　}
　　　　　　　　　　　　　　content.append("
");
　　　　　　　　　　　　}
　　　　　　　　　　}
　　　　　　　　}
　　　　　　}
　　　　}
　　}
　　catch(Exception e){
　　　　content.append("xls         ");
　　}
　　finally{
　　　　if(workbook!=null){
　　　　　　workbook.close();
　　　　}
　　}
　　return content.toString();
}


public String readXLSX(String file) throws Exception
{
　　StringBuilder content = new StringBuilder();
　　XSSFWorkbook workbook = new XSSFWorkbook(file);
　　try{
　　　　for(int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++){
　　　　　　if (null != workbook.getSheetAt(numSheets)){
　　　　　　　　XSSFSheet aSheet = workbook.getSheetAt(numSheets);//     sheet
　　　　　　　　for(int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++){
　　　　　　　　　　if (null != aSheet.getRow(rowNumOfSheet)){
　　　　　　　　　　　　XSSFRow aRow = aSheet.getRow(rowNumOfSheet); //      
　　　　　　　　　　　　for(short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++){
　　　　　　　　　　　　　　if (null != aRow.getCell(cellNumOfRow)){
　　　　　　　　　　　　　　　　XSSFCell aCell = aRow.getCell(cellNumOfRow);//     
　　　　　　　　　　　　　　　　if (this.convertCell(aCell).length() > 0){
　　　　　　　　　　　　　　　　　　content.append(this.convertCell(aCell));
　　　　　　　　　　　　　　　　}
　　　　　　　　　　　　　　}
　　　　　　　　　　　　　　content.append("
");
　　　　　　　　　　　　}
　　　　　　　　　　}
　　　　　　　　}
　　　　　　}
　　　　}
　　}catch(Exception e){
　　　　content.append("xlsx         ");
　　}
　　finally{
　　　　if(workbook!=null){
　　　　　　workbook.close();
　　　　}
　　}
　　return content.toString();
}

public String readTXT(String file) throws Exception
{
　　String contents = "";
　　try{
　　　　String encoding = this.get_charset(new File(file));
　　　　if (encoding.equalsIgnoreCase("GBK")) {
　　　　　　contents = FileUtils.readFileToString(new File(file), "gbk");
　　　　} else {
　　　　　　contents = FileUtils.readFileToString(new File(file), "utf8");
　　　　}
　　}catch(Exception e){
　　　　contents = "txt         ";
　　}
　　return contents;
}

public String readDOC(String file) throws Exception
{
　　String returnStr;
　　WordExtractor wordExtractor = new WordExtractor(new FileInputStream(new File(file)));
　　try{
　　　　returnStr = wordExtractor.getText();
　　}catch(Exception e){
　　　　returnStr="doc         ";
　　}
　　finally{
　　　　if(wordExtractor != null){
　　　　　　wordExtractor.close();
　　　　}
　　}
　　return returnStr;
}


public String readDOCX(String file) throws Exception
{
　　String docx;
　　XWPFWordExtractor xwp= new XWPFWordExtractor(POIXMLDocument.openPackage(file));
　　try{
　　　　docx= xwp.getText();
　　}catch(Exception e){
　　　　docx="docx         ";
　　}
　　finally{
　　　　if(xwp !=null){
　　　　　　xwp.close();
　　　　}
　　}
　　return docx;
}


public String readPDF(String file) throws Exception
{
　　String result = null;
　　FileInputStream is = null;
　　PDDocument document = null;
　　try{
　　　　is = new FileInputStream(file);
　　　　document = PDDocument.load(is);
　　　　PDFTextStripper stripper = new PDFTextStripper();
　　　　result = stripper.getText(document);
　　}catch(Exception e){
　　　　result="pdf         ";
　　}
　　finally{
　　　　if (is != null){
　　　　　　is.close();
　　　　}
　　　　if (document != null){
　　　　　　document.close();
 　　　}
　　}
　　return result;
}

private String get_charset(File file) throws IOException 
{
　　String charset = "GBK";
　　byte[] first3Bytes = new byte[3];
　　BufferedInputStream bis = null;
　　try {
　　　　boolean checked = false;
　　　　bis = new BufferedInputStream(new FileInputStream(file));
　　　　bis.mark(0);
　　　　int read = bis.read(first3Bytes, 0, 3);
　　　　if (read == -1)
　　　　　　return charset;
　　　　if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
　　　　　　charset = "UTF-16LE";
　　　　　　checked = true;
　　　　} else if (first3Bytes[0] == (byte) 0xFE&& first3Bytes[1] == (byte) 0xFF) {
　　　　　　charset = "UTF-16BE";
　　　　　　checked = true;
　　　　} else if (first3Bytes[0] == (byte) 0xEF&& first3Bytes[1] == (byte) 0xBB&& first3Bytes[2] == (byte) 0xBF) {
　　　　　　charset = "UTF-8";
　　　　　　checked = true;
　　　　}
　　　　bis.reset();
　　　　if (!checked) {
　　　　　　// int len = 0;
　　　　　　int loc = 0;
　　　　　　while ((read = bis.read()) != -1) {
　　　　　　　　loc=loc+1;
　　　　　　　　if (read >= 0xF0)
　　　　　　　　　　break;
　　　　　　　　if (0x80 <= read && read <= 0xBF) //     BF   ，   GBK
　　　　　　　　　　break;
　　　　　　　　if (0xC0 <= read && read <= 0xDF) {
　　　　　　　　　　read = bis.read();
　　　　　　　　　　if (0x80 <= read && read <= 0xBF) //     (0xC0 - 0xDF)
　　　　　　　　　　// (0x80
　　　　　　　　　　// - 0xBF),    GB   
　　　　　　　　　　　　continue;
　　　　　　　　　　else
　　　　　　　　　　　　break;
　　　　　　　　} else if (0xE0 <= read && read <= 0xEF) {//       ，      
　　　　　　　　　　read = bis.read();
　　　　　　　　　　if (0x80 <= read && read <= 0xBF) {
　　　　　　　　　　　　read = bis.read();
　　　　　　　　　　　　if (0x80 <= read && read <= 0xBF) {
　　　　　　　　　　　　　　charset = "UTF-8";
　　　　　　　　　　　　　　break;
　　　　　　　　　　　　} else
　　　　　　　　　　　　　　break;
　　　　　　　　　　} else
　　　　　　　　　　　　break;
　　　　　　　　}
　　　　　　}
　　　　}
　　} catch (Exception e) {
　　　　e.printStackTrace();
　　} finally {
　　　　if (bis != null) {
　　　　　　bis.close();
　　　　}
　　}
　　return charset;
}

@SuppressWarnings("deprecation")
private String convertCell(Cell cell) 
{
　　NumberFormat formater = NumberFormat.getInstance();
　　formater.setGroupingUsed(false);
　　String cellValue = "";
　　if (cell == null) {
　　　　return cellValue;
　　}
　　switch (cell.getCellTypeEnum()) {
　　　　case NUMERIC:
　　　　　　cellValue = formater.format(cell.getNumericCellValue());
　　　　　　break;
　　　　case STRING:
　　　　　　cellValue = cell.getStringCellValue();
　　　　　　break;
　　　　case BLANK:
　　　　　　cellValue = cell.getStringCellValue();
　　　　　　break;
　　　　case BOOLEAN:
　　　　　　cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
　　　　　　break;
　　　　case ERROR:
　　　　　　cellValue = String.valueOf(cell.getErrorCellValue());
　　　　　　break;
　　　　default:
　　　　　　cellValue = "";
　　　　}
　　　　return cellValue.trim();
　　}

}
転載元:https://www.cnblogs.com/zwdx/p/7234484.html
AtCoder に登録したら解くべき精選過去問 10 問を Objective-C で解いてみた
Objective-C で AtCoder に参加する際の苦労