HDFSにおけるPathFilterクラスによるパスのフィルタリング

2048 ワード

1、PathFilterインタフェースを実装するクラスを定義する
package com.ru.hadoop.wordcount;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

/**
 * {@link PathFilter} implementation that accepts a path when the path's
 * string form contains a given substring.
 *
 * @author nange
 */
public class MyFilePathFileter implements PathFilter{
	// Substring that a path's string form must contain in order to be accepted.
	private final String fileName;

	/**
	 * @param fileName substring to look for in each candidate path
	 */
	public MyFilePathFileter(String fileName){
		this.fileName = fileName;
	}

	/**
	 * Tests whether the string form of {@code path} contains the configured
	 * substring, and prints the path together with the decision to standard output.
	 *
	 * @param path candidate path, e.g. hdfs://localhost:9000/hdfs/test/wordcount/in/word.txt
	 * @return {@code true} if the path string contains the substring, {@code false} otherwise
	 */
	@Override
	public boolean accept(Path path) {
		String candidate = path.toString();
		boolean matched = candidate.contains(fileName);
		System.out.println("path = " + path + " :" + matched);
		return matched;
	}

}

2、FileSystemが提供するglobStatus()メソッドでファイルパスをフィルタする
/**
	 * Collects the paths returned by {@code fs.globStatus()} for the given glob
	 * pattern, filtered by a {@link MyFilePathFileter} built with the fixed
	 * substring {@code "in/word"}, and joins them into one comma-separated string.
	 * The joined string is also printed to standard output.
	 * 
	 * NOTE(review): {@code fs} is declared outside this snippet; presumably a
	 * Hadoop FileSystem instance - confirm against the enclosing class.
	 * 
	 * @param in glob pattern of the input paths, e.g. hdfs://localhost:9000/hdfs/test/wordcount/in/*
	 * @return the matching paths joined with ",", or {@code null} when there are none
	 * @throws IOException propagated from {@code fs.globStatus()}
	 */
	public String filePaths(String in) throws IOException{
		FileStatus[] matchedStatuses = fs.globStatus(new Path(in), new MyFilePathFileter("in/word"));
		Path[] matchedPaths = FileUtil.stat2Paths(matchedStatuses);
		// Nothing matched (or no status array at all): callers get null, not an empty string.
		if(matchedPaths == null || matchedPaths.length == 0){
			return null;
		}

		StringBuilder joined = new StringBuilder();
		for(int i = 0; i < matchedPaths.length; i++){
			// Separator goes before every element except the first, so no trailing comma is left.
			if(i > 0){
				joined.append(',');
			}
			joined.append(matchedPaths[i].toString());
		}

		String result = joined.toString();
		System.out.println(" :" + result);
		return result;
	}

3、ジョブに複数の入力パスを指定する
fileInPaths: 複数の入力パスを「,」で区切って連結した文字列。例：hdfs://localhost:9000/hdfs/test/wordcount/in/word.txt,hdfs://localhost:9000/hdfs/test/wordcount/in/word2.txt

FileInputFormat.addInputPaths(job, fileInPaths);// add the comma-separated paths in fileInPaths to the job's input paths