リードコード-MinHashDriverおよび関連

6259 ワード

使用:汎用クラスcounterハッシュ実装
package org.apache.mahout.clustering.minhash;
public final class MinHashDriver extends AbstractJob
Sequence形式の入力
出力はdebugモードのオプションベクトルとテキストフォーマットに従って、ファイルはSequenceとTextフォーマットすることができます

    Class<? extends Writable> outputClass = 
        debugOutput ? VectorWritable.class : Text.class;
    Class<? extends OutputFormat> outputFormatClass = 
        debugOutput ? SequenceFileOutputFormat.class : TextOutputFormat.class;

    job.setMapperClass(MinHashMapper.class);
    job.setReducerClass(MinHashReducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(outputFormatClass);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(outputClass);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputClass);

package org.apache.mahout.clustering.minhash;
public class MinHashMapper extends Mapper
setup関数で
タイプと数に基づいてhash関数グループを生成する

hashFunction = HashFactory.createHashFunctions(hashType, numHashFunctions);

map関数
ハッシュ関数ごとに
関数をitemの各featureに適用し、feature値を4バイトに変換し、最小のハッシュ値をとる

    for (int i = 0; i < numHashFunctions; i++) {
      for (Vector.Element ele : featureVector) {
        int value = (int) ele.get();
        bytesToHash[0] = (byte) (value >> 24);
        bytesToHash[1] = (byte) (value >> 16);
        bytesToHash[2] = (byte) (value >> 8);
        bytesToHash[3] = (byte) value;
        int hashIndex = hashFunction[i].hash(bytesToHash);
        if (minHashValues[i] > hashIndex) {
          minHashValues[i] = hashIndex;
        }
      }
    }

クラスタリングidの組合せと配布
keyGroup制御idの構成セグメント数
形式はXXX-XXX-XXX形式
各itemはハッシュ関数ごとに1回配布されます

    for (int i = 0; i < numHashFunctions; i++) {
      StringBuilder clusterIdBuilder = new StringBuilder();
      for (int j = 0; j < keyGroups; j++) {
        clusterIdBuilder.append(minHashValues[(i + j) % numHashFunctions]).append('-');
      }
      String clusterId = clusterIdBuilder.toString();
      clusterId = clusterId.substring(0, clusterId.lastIndexOf('-'));
      Text cluster = new Text(clusterId);
      Writable point;
      if (debugOutput) {
        point = new VectorWritable(featureVector.clone());
      } else {
        point = new Text(item.toString());
      }
      context.write(cluster, point);
    }

package org.apache.mahout.clustering.minhash;
public class MinHashReducer extends Reducer
reduce関数
Debugタイプに応じて異なるタイプを解析

    Collection<Writable> pointList = new ArrayList<Writable>();
    for (Writable point : points) {
      if (debugOutput) {
        Vector pointVector = ((VectorWritable) point).get().clone();
        Writable writablePointVector = new VectorWritable(pointVector);
        pointList.add(writablePointVector);
      } else {
        Writable pointText = new Text(point.toString());
        pointList.add(pointText);
      }
    }

counterはenumタイプのパラメータを使用します
最小クラスタ数未満の数の廃棄

    if (pointList.size() >= minClusterSize) {
      context.getCounter(Clusters.ACCEPTED).increment(1);
      for (Writable point : pointList) {
        context.write(cluster, point);
      }
    } else {
      context.getCounter(Clusters.DISCARDED).increment(1);
    }

package org.apache.mahout.clustering.minhash;
public final class HashFactory
ハッシュの実装
3つのハッシュ・タイプ

  public enum HashType {
    LINEAR, POLYNOMIAL, MURMUR
  }

双晶素数、差が2の2つの数が素数の場合
整数範囲内の最大双晶素数の小さい値
RandomUtils.MAX_INT_SMALLER_TWIN_PRIME = 2147482949
ハッシュは素数で型を取る衝突が小さい
線形ハッシュ

  static class LinearHash implements HashFunction {
    private final int seedA;
    private final int seedB;

    LinearHash(int seedA, int seedB) {
      this.seedA = seedA;
      this.seedB = seedB;
    }

    @Override
    public int hash(byte[] bytes) {
      long hashValue = 31;
      for (long byteVal : bytes) {
        hashValue *= seedA * byteVal;
        hashValue += seedB;
      }
      return Math.abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME));
    }
  }

多項式ハッシュ

  static class PolynomialHash implements HashFunction {
    private final int seedA;
    private final int seedB;
    private final int seedC;

    PolynomialHash(int seedA, int seedB, int seedC) {
      this.seedA = seedA;
      this.seedB = seedB;
      this.seedC = seedC;
    }

    @Override
    public int hash(byte[] bytes) {
      long hashValue = 31;
      for (long byteVal : bytes) {
        hashValue *= seedA * (byteVal >> 4);
        hashValue += seedB * byteVal + seedC;
      }
      return Math
          .abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME));
    }
  }

MurMurハッシュ

  static class MurmurHashWrapper implements HashFunction {
    private final int seed;

    MurmurHashWrapper(int seed) {
      this.seed = seed;
    }

    @Override
    public int hash(byte[] bytes) {
      long hashValue = MurmurHash.hash64A(bytes, seed);
      return Math.abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME));
    }
  }