ネット爬虫類速成ガイド(四)URL判重
52709 ワード
:ブロンフィルタの な :
redis: URL md5, key
, : private static BloomFilter<String> bloomFilter = new BloomFilter<String>(2X, X);
: ,@author Magnus Skjegstad <[email protected]>
import java.io.Serializable;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.BitSet;
import java.util.Collection;
/**
* Implementation of a Bloom-filter, as described here:
* http://en.wikipedia.org/wiki/Bloom_filter
*
* Inspired by the SimpleBloomFilter-class written by Ian Clarke. This
* implementation provides a more evenly distributed Hash-function by using a
* proper digest instead of the Java RNG. Many of the changes were proposed in
* comments in his blog:
* http://blog.locut.us/2008/01/12/a-decent-stand-alone-java
* -bloom-filter-implementation/
*
* @param <E>
* Object type that is to be inserted into the Bloom filter, e.g.
* String or Integer.
* @author Magnus Skjegstad <[email protected]>
*/
public class BloomFilter<E> implements Serializable {
private BitSet bitset;
private int bitSetSize;
private double bitsPerElement;
private int expectedNumberOfFilterElements; // expected (maximum) number of
// elements to be added
private int numberOfAddedElements; // number of elements actually added to
// the Bloom filter
private int k; // number of hash functions
static final Charset charset = Charset.forName("UTF-8"); // encoding used
// for storing
// hash values
// as strings
static final String hashName = "MD5"; // MD5 gives good enough accuracy in
// most circumstances. Change to
// SHA1 if it's needed
static final MessageDigest digestFunction;
static { // The digest method is reused between instances
MessageDigest tmp;
try {
tmp = java.security.MessageDigest.getInstance(hashName);
} catch (NoSuchAlgorithmException e) {
tmp = null;
}
digestFunction = tmp;
}
/**
* Constructs an empty Bloom filter. The total length of the Bloom filter
* will be c*n.
*
* @param c
* is the number of bits used per element.
* @param n
* is the expected number of elements the filter will contain.
* @param k
* is the number of hash functions used.
*/
public BloomFilter(double c, int n, int k) {
this.expectedNumberOfFilterElements = n;
this.k = k;
this.bitsPerElement = c;
this.bitSetSize = (int) Math.ceil(c * n);
numberOfAddedElements = 0;
this.bitset = new BitSet(bitSetSize);
}
/**
* Constructs an empty Bloom filter. The optimal number of hash functions
* (k) is estimated from the total size of the Bloom and the number of
* expected elements.
*
* @param bitSetSize
* defines how many bits should be used in total for the filter.
* @param expectedNumberOElements
* defines the maximum number of elements the filter is expected
* to contain.
*/
public BloomFilter(int bitSetSize, int expectedNumberOElements) {
this(bitSetSize / (double) expectedNumberOElements,
expectedNumberOElements, (int) Math
.round((bitSetSize / (double) expectedNumberOElements)
* Math.log(2.0)));
}
/**
* Constructs an empty Bloom filter with a given false positive probability.
* The number of bits per element and the number of hash functions is
* estimated to match the false positive probability.
*
* @param falsePositiveProbability
* is the desired false positive probability.
* @param expectedNumberOfElements
* is the expected number of elements in the Bloom filter.
*/
public BloomFilter(double falsePositiveProbability,
int expectedNumberOfElements) {
this(Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2)))
/ Math.log(2), // c = k / ln(2)
expectedNumberOfElements, (int) Math.ceil(-(Math
.log(falsePositiveProbability) / Math.log(2)))); // k =
// ceil
// (
// -
// log_2
// (
// false
// prob
// .
// )
// )
}
/**
* Construct a new Bloom filter based on existing Bloom filter data.
*
* @param bitSetSize
* defines how many bits should be used for the filter.
* @param expectedNumberOfFilterElements
* defines the maximum number of elements the filter is expected
* to contain.
* @param actualNumberOfFilterElements
* specifies how many elements have been inserted into the
* <code>filterData</code> BitSet.
* @param filterData
* a BitSet representing an existing Bloom filter.
*/
public BloomFilter(int bitSetSize, int expectedNumberOfFilterElements,
int actualNumberOfFilterElements, BitSet filterData) {
this(bitSetSize, expectedNumberOfFilterElements);
this.bitset = filterData;
this.numberOfAddedElements = actualNumberOfFilterElements;
}
/**
* Generates a digest based on the contents of a String.
*
* @param val
* specifies the input data.
* @param charset
* specifies the encoding of the input data.
* @return digest as long.
*/
public static long createHash(String val, Charset charset) {
return createHash(val.getBytes(charset));
}
/**
* Generates a digest based on the contents of a String.
*
* @param val
* specifies the input data. The encoding is expected to be
* UTF-8.
* @return digest as long.
*/
public static long createHash(String val) {
return createHash(val, charset);
}
/**
* Generates a digest based on the contents of an array of bytes.
*
* @param data
* specifies input data.
* @return digest as long.
*/
public static long createHash(byte[] data) {
long h = 0;
byte[] res;
synchronized (digestFunction) {
res = digestFunction.digest(data);
}
for (int i = 0; i < 4; i++) {
h <<= 8;
h |= ((int) res[i]) & 0xFF;
}
return h;
}
/**
* Compares the contents of two instances to see if they are equal.
*
* @param obj
* is the object to compare to.
* @return True if the contents of the objects are equal.
*/
@Override
public boolean equals(Object obj) {
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
final BloomFilter<E> other = (BloomFilter<E>) obj;
if (this.expectedNumberOfFilterElements != other.expectedNumberOfFilterElements) {
return false;
}
if (this.k != other.k) {
return false;
}
if (this.bitSetSize != other.bitSetSize) {
return false;
}
if (this.bitset != other.bitset
&& (this.bitset == null || !this.bitset.equals(other.bitset))) {
return false;
}
return true;
}
/**
* Calculates a hash code for this class.
*
* @return hash code representing the contents of an instance of this class.
*/
@Override
public int hashCode() {
int hash = 7;
hash = 61 * hash + (this.bitset != null ? this.bitset.hashCode() : 0);
hash = 61 * hash + this.expectedNumberOfFilterElements;
hash = 61 * hash + this.bitSetSize;
hash = 61 * hash + this.k;
return hash;
}
/**
* Calculates the expected probability of false positives based on the
* number of expected filter elements and the size of the Bloom filter. <br
* /><br /> The value returned by this method is the <i>expected</i> rate of
* false positives, assuming the number of inserted elements equals the
* number of expected elements. If the number of elements in the Bloom
* filter is less than the expected value, the true probability of false
* positives will be lower.
*
* @return expected probability of false positives.
*/
public double expectedFalsePositiveProbability() {
return getFalsePositiveProbability(expectedNumberOfFilterElements);
}
/**
* Calculate the probability of a false positive given the specified number
* of inserted elements.
*
* @param numberOfElements
* number of inserted elements.
* @return probability of a false positive.
*/
public double getFalsePositiveProbability(double numberOfElements) {
// (1 - e^(-k * n / m)) ^ k
return Math.pow((1 - Math.exp(-k * (double) numberOfElements
/ (double) bitSetSize)), k);
}
/**
* Get the current probability of a false positive. The probability is
* calculated from the size of the Bloom filter and the current number of
* elements added to it.
*
* @return probability of false positives.
*/
public double getFalsePositiveProbability() {
return getFalsePositiveProbability(numberOfAddedElements);
}
/**
* Returns the value chosen for K.<br /> <br /> K is the optimal number of
* hash functions based on the size of the Bloom filter and the expected
* number of inserted elements.
*
* @return optimal k.
*/
public int getK() {
return k;
}
/**
* Sets all bits to false in the Bloom filter.
*/
public void clear() {
bitset.clear();
numberOfAddedElements = 0;
}
/**
* Adds an object to the Bloom filter. The output from the object's
* toString() method is used as input to the hash functions.
*
* @param element
* is an element to register in the Bloom filter.
*/
public void add(E element) {
long hash;
String valString = element.toString();
for (int x = 0; x < k; x++) {
hash = createHash(valString + Integer.toString(x));
hash = hash % (long) bitSetSize;
bitset.set(Math.abs((int) hash), true);
}
numberOfAddedElements++;
}
/**
* Adds all elements from a Collection to the Bloom filter.
*
* @param c
* Collection of elements.
*/
public void addAll(Collection<? extends E> c) {
for (E element : c)
add(element);
}
/**
* Returns true if the element could have been inserted into the Bloom
* filter. Use getFalsePositiveProbability() to calculate the probability of
* this being correct.
*
* @param element
* element to check.
* @return true if the element could have been inserted into the Bloom
* filter.
*/
public boolean contains(E element) {
long hash;
String valString = element.toString();
for (int x = 0; x < k; x++) {
hash = createHash(valString + Integer.toString(x));
hash = hash % (long) bitSetSize;
if (!bitset.get(Math.abs((int) hash)))
return false;
}
return true;
}
/**
* Returns true if all the elements of a Collection could have been inserted
* into the Bloom filter. Use getFalsePositiveProbability() to calculate the
* probability of this being correct.
*
* @param c
* elements to check.
* @return true if all the elements in c could have been inserted into the
* Bloom filter.
*/
public boolean containsAll(Collection<? extends E> c) {
for (E element : c)
if (!contains(element))
return false;
return true;
}
/**
* Read a single bit from the Bloom filter.
*
* @param bit
* the bit to read.
* @return true if the bit is set, false if it is not.
*/
public boolean getBit(int bit) {
return bitset.get(bit);
}
/**
* Set a single bit in the Bloom filter.
*
* @param bit
* is the bit to set.
* @param value
* If true, the bit is set. If false, the bit is cleared.
*/
public void setBit(int bit, boolean value) {
bitset.set(bit, value);
}
/**
* Return the bit set used to store the Bloom filter.
*
* @return bit set representing the Bloom filter.
*/
public BitSet getBitSet() {
return bitset;
}
/**
* Returns the number of bits in the Bloom filter. Use count() to retrieve
* the number of inserted elements.
*
* @return the size of the bitset used by the Bloom filter.
*/
public int size() {
return this.bitSetSize;
}
/**
* Returns the number of elements added to the Bloom filter after it was
* constructed or after clear() was called.
*
* @return number of elements added to the Bloom filter.
*/
public int count() {
return this.numberOfAddedElements;
}
/**
* Returns the expected number of elements to be inserted into the filter.
* This value is the same value as the one passed to the constructor.
*
* @return expected number of elements.
*/
public int getExpectedNumberOfElements() {
return expectedNumberOfFilterElements;
}
/**
* Get expected number of bits per element when the Bloom filter is full.
* This value is set by the constructor when the Bloom filter is created.
* See also getBitsPerElement().
*
* @return expected number of bits per element.
*/
public double getExpectedBitsPerElement() {
return this.bitsPerElement;
}
/**
* Get actual number of bits per element based on the number of elements
* that have currently been inserted and the length of the Bloom filter. See
* also getExpectedBitsPerElement().
*
* @return number of bits per element.
*/
public double getBitsPerElement() {
return this.bitSetSize / (double) numberOfAddedElements;
}
}
package com.lietu.show;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Properties;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
//
public class ExtractProduct {
static BloomFilter<String> urlSeen = new BloomFilter<String>(200000, 20000);
/**
* @param args
* @throws IOException
*/
public static void main(String[] args) throws IOException {
String url = "http://www.xiu.com/about/classifymap.shtml";
Document doc = Jsoup.connect(url).get();
Elements links = doc.select("a[href]"); // href a
for (Element link : links) { //
String linkHref = link.attr("href"); // href , url
if (linkHref.startsWith("http://list.xiu.com/")
|| linkHref.startsWith("http://brand.xiu.com/")) {
linkHref = getListURL(linkHref);
if (urlSeen.contains(linkHref))
continue;
// System.out.println(linkHref); // url
extractList(linkHref);
}
}
}
public static String getListURL(String oldURL){
ParseURL splitURL = new ParseURL(oldURL);
if(splitURL.searchparms==null){
return splitURL.baseURL;
}
//System.out.println(splitURL.baseURL);
String curPage = splitURL.searchparms.get("currentPage");
// http://list.xiu.com/20069.html?currentPage=2
if (curPage != null)
//System.out.println(splitURL.baseURL + "?currentPage=" + curPage);
return splitURL.baseURL + "?currentPage=" + curPage;
else{
//System.out.println(splitURL.baseURL);
return splitURL.baseURL;
}
}
//
// http://list.xiu.com/19757.html
// http://brand.xiu.com/3117.html
public static void extractList(String url) {
System.out.println("List url:" + url);
urlSeen.add(url);
Document doc = null;
try {
doc = Jsoup.connect(url).get();
} catch (Exception e) {
try {
doc = Jsoup.connect(url).get();
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
return;
}
}
Elements links = doc.select("a[href]"); // href a
for (Element link : links) { //
String linkHref = link.attr("href"); // href , url
if (linkHref.startsWith("http://list.xiu.com/")
|| linkHref.startsWith("http://brand.xiu.com/")) {
linkHref = getListURL(linkHref);
if (urlSeen.contains(linkHref))
continue;
extractList(linkHref); // url
} else if (linkHref.startsWith("http://item.xiu.com/product/")) { // "http://item.xiu.com/product/0359097.shtml"
if (urlSeen.contains(linkHref))
continue;
getProduct(linkHref);
}
}
}
//
public static void getProduct(String url) {
System.out.println("Product url:" + url);
urlSeen.add(url);
// String url = "http://item.xiu.com/product/0359097.shtml";
Document doc = null;
try {
doc = Jsoup.connect(url).get();
} catch (Exception e) {
try{
doc = Jsoup.connect(url).get();
}
catch (Exception e2) {
try {
doc = Jsoup.connect(url).get();
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
return;
}
}
}
Elements links = doc.select("div.p_title"); // href a
String name = links.get(0).childNode(1).childNode(0).toString();
// System.out.println(name);
Element link = doc.select("div.conlist").get(2);
String desc = link.text();
// System.out.println(link.text());
Element img = doc.select("#imgPic").first();
String imgSrc = img.attr("src");
// System.out.println(img.attr("src"));
Element thumb = doc.select("img[onload]").first();
String thumbSrc = thumb.attr("src");
//System.out.println(thumbSrc);
insertDatabase(url,name,desc,imgSrc,thumbSrc);
}
public static void insertDatabase(String url, String name, String desc,
String img,String thumbSrc) {
Connection conn = null;
PreparedStatement stmt2 = null;
try {
Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
Properties p = new Properties();
p.put("charSet", "gb2312");
conn = DriverManager
.getConnection("jdbc:odbc:driver={Microsoft Access Driver (*.mdb)};DBQ=good.mdb",p);
stmt2 = conn
.prepareStatement("insert into good(url,productName,productDesc,img,thumb)values(?,?,?,?,?)");
stmt2.setString(1, url);
stmt2.setString(2, name);
stmt2.setString(3, desc);
stmt2.setString(4, img);
stmt2.setString(5, thumbSrc);
stmt2.executeUpdate();
} catch (Exception ex) {
ex.printStackTrace();
} finally {
try {
if (stmt2 != null)
stmt2.close();
if (conn != null)
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}