Spark Java プログラミング練習


今日は Spark Java の一般的な操作を練習しました。
SparkConf conf=new SparkConf();
conf.setAppName("xxxxxx");
conf.setMaster("local");

JavaSparkContext sc=new JavaSparkContext(conf);
sc.setLogLevel("error");

JavaRDD rdd1=sc.textFile("test.txt");
System.out.println("rdd1:"+rdd1);

出力: rdd1:[a 1, b 2, a 3, b 4, c 3, b 9]

JavaRDD> rdd2=rdd1.map(w->Arrays.asList(w.split(" ")));
System.out.println("rdd2:"+rdd2.collect());

出力: rdd2:[[a, 1], [b, 2], [a, 3], [b, 4], [c, 3], [b, 9]]

// mapToPair(): split each line "k v" into a (k, v) tuple.
// PairFunction<T, K, V>: input type String, key String, value String.
JavaPairRDD<String, String> pairRdd1 = rdd1.mapToPair(new PairFunction<String, String, String>() {

			@Override
			public Tuple2<String, String> call(String t) throws Exception {
				String[] st = t.split(" ");
				// First token becomes the key, second the value.
				return new Tuple2<>(st[0], st[1]);
			}
		});
System.out.println("pairRdd1:" + pairRdd1.collect());

出力: pairRdd1:[(a,1), (b,2), (a,3), (b,4), (c,3), (b,9)]

mapToPair() でキーと値を入れ替える例
// mapToPair() again, but with key and value swapped relative to pairRdd1:
// the numeric token becomes the key, the letter the value.
JavaPairRDD<String, String> pairRdd2 = rdd1.mapToPair(new PairFunction<String, String, String>() {

			@Override
			public Tuple2<String, String> call(String t) throws Exception {
				String[] st = t.split(" ");
				// Second token becomes the key, first the value.
				return new Tuple2<>(st[1], st[0]);
			}
		});
System.out.println("pairRdd2:" + pairRdd2.collect());
出力: pairRdd2:[(1,a), (2,b), (3,a), (4,b), (3,c), (9,b)]

groupByKey() の例 1
JavaPairRDD> pairRdd3=pairRdd1.groupByKey();
System.out.println("pairRdd3:"+pairRdd3.collect());

出力: pairRdd3:[(a,[1, 3]), (b,[2, 4, 9]), (c,[3])]

groupByKey() の例 2
JavaPairRDD> pairRdd4=pairRdd2.groupByKey();
System.out.println("pairRdd4:"+pairRdd4.collect());

出力: pairRdd4:[(4,[b]), (2,[b]), (9,[b]), (3,[a, c]), (1,[a])]

keyBy() の例
// keyBy(): derive a key from each element without changing the element;
// here the key is the second token (index 1) of each [letter, number] list,
// so [a, 1] becomes (1, [a, 1]).
JavaPairRDD<String, List<String>> pairRdd5 = rdd2.keyBy(new Function<List<String>, String>() {

			@Override
			public String call(List<String> s1) throws Exception {
				return s1.get(1);
			}

		});
System.out.println("pairRdd5:" + pairRdd5.collect());

出力: pairRdd5:[(1,[a, 1]), (2,[b, 2]), (3,[a, 3]), (4,[b, 4]), (3,[c, 3]), (9,[b, 9])]



以上、Spark Java の基本操作の練習でした。
補足: Java 8 の lambda 式を使うと、匿名クラスよりも簡潔に書けます。