C# マルチスレッド Web クローラー(Spider)
これは C# で書かれた、マルチスレッドで Web ページを自動収集するプログラムです。以下にメインクラスのコードを示します。完全なコードはここからダウンロードしてください。
転載元:http://www.cnblogs.com/closetome/articles/1711764.html
/**
:
--------------------------------------------
-> -> -> -> ->
( )
sql ( )
c#
UC1:
----------------------------------------------
: WSE
:
:
:
—— : , URL, , URL, , URL
—— :
:
( ): , URL
( ):
1. URL, ( , , ),
2.
3. ( )
4. URL( )
5. URL ( )
PageInfo
----------------------------
| id
| createdTime
| modifiedTime
| createdUser
| modifiedUser
|
| URL
128 MD5 | UrlMD5
IP | IP
| content
| type
|
-------------------------------
: UrlMD5
|-------------------------------------------------------------------------------------------------------|
|Spider |
| |---------------------------------------------------------------------| |----------------| |
| |--| | ( ) ------------------------------|--->| UrlMD5 | |
| | | | |---------| |--------| |---|------| |--------| | |----------------| |
| | |---|---|->| URL |--->| |--->| | | |---->| |---| | |
| | | | | |---------| |--------| |------|---| |--------| | | |
| | | | | | | | |
| |--| | |--------------------------------------|----------------------| | |----------------| |
| |---| | |
| | |---------------------------------------------------------------------| |--------|-------| |
| | | |
| | |---------| | |
| |---| | |
| |---------| |
| |
|-------------------------------------------------------------------------------------------------------|
-------------------------------
SpiderHandler --
Spider
PageInfo --
Gatherer --
Analyser --url
IStorage
ISpiderUI
ILogger
*/
//==================================================================
//
// C# 。
// , , 。
//
// Copyright(C) 2009 themz.cn All rights reserved
// author: xml
// email: [email protected]
// blog: http://programmingcanruinyourlife.themz.cn/
// since: .net2.0
// version: 1.0
// created: 2009-08-06
// modified: 2009-10-10
//
// : , 。
// 。
// , 。
// !
//
//==================================================================
// :
// 1. .cs
// 2. .net2.0 exe ,
// 3. addSeeds , : addSeeds http://url/
// 4. start
// 5. getContents
// 6. pause , start
// 7. stop
// 8. exit
//
//
using System;
using System.Collections.Generic;
using System.Data;
using System.Data.Common;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
//using System.Configuration;
//using System.Diagnostics;
//[Serializable()]
namespace My.WSE.Spider
{
#region
/**
/
*/
/// <summary>
/// Thread-management contract implemented by the worker classes in this file
/// (PageGatherer, PageStorage and EventLogger).
/// </summary>
/// <remarks>
/// A generic queue / thread-list surface (Queue, Threads, Enqueue, Dequeue) was
/// sketched for this interface in the original but left commented out; it is
/// not part of the contract.
/// </remarks>
public interface IThread
{
    /// <summary>Adds a worker thread and returns it.</summary>
    Thread AddThread();

    /// <summary>Removes a worker thread.</summary>
    void RemoveThread();

    /// <summary>Asks the worker threads to pause.</summary>
    void RequestThreadPause();

    /// <summary>Asks the worker threads to pause or to continue.</summary>
    /// <param name="pauseOrContinue">
    /// <c>true</c> to pause, <c>false</c> to continue (this is how
    /// PageGatherer interprets the flag).
    /// </param>
    void RequestThreadPause(bool pauseOrContinue);

    /// <summary>Asks the worker threads to stop.</summary>
    void RequestThreadStop();
}
#endregion
#region
//
/// <summary>
/// Contract for the component that downloads the page described by a
/// <c>PageInfo</c>. PageGatherer.ThreadRun calls the three-argument overload
/// with "text/html" and 90000.
/// </summary>
public interface IGatherer
{
    /// <summary>Downloads the page for <paramref name="info"/>.</summary>
    /// <param name="info">Page descriptor; passed by reference, so an implementation may replace it.</param>
    /// <param name="contentType">Content type string, e.g. "text/html" (presumably the type to accept; confirm against the implementation).</param>
    /// <param name="timeout">Timeout value (presumably milliseconds; confirm against the implementation).</param>
    void Download(ref PageInfo info, string contentType, int timeout);

    /// <summary>Downloads the page for <paramref name="info"/> without naming a content type.</summary>
    /// <param name="info">Page descriptor; passed by reference, so an implementation may replace it.</param>
    /// <param name="timeout">Timeout value (presumably milliseconds; confirm against the implementation).</param>
    void Download(ref PageInfo info, int timeout);
}
//
// Persistence contract used by the spider: index lookup, seed management and
// storage of downloaded pages.
// NOTE(review): the generic type arguments of the List types below are missing
// from this listing (bare `List`); the element types mentioned in the comments
// are guesses. Restore them from the complete source.
public interface IStorage
{
List GetIndexeds(); // presumably the MD5 digests of URLs that are already indexed (confirm)
List GetSeeds(); // returns the stored seeds; element type is not visible here
int AddSeed( SeedInfo info ); // adds a seed; the meaning of the int result is not visible here
void RemoveSeed( SeedInfo info ); // removes a seed
void SaveContents( List info ); // persists a batch of pages; PageStorage.ThreadRun passes the batch it dequeued
}
//
/// <summary>
/// Log sink used by EventLogger, which hands every queued message to
/// <see cref="Write"/>.
/// </summary>
public interface ILogger
{
    /// <summary>Writes one log entry.</summary>
    /// <param name="content">Text of the entry.</param>
    void Write(string content);

    /// <summary>Reads back a log.</summary>
    /// <param name="filename">Name of the log to read (presumably a file name or path; confirm against the implementation).</param>
    /// <returns>The log content as a string.</returns>
    string Read(string filename);

    /// <summary>Formats an exception as text.</summary>
    /// <param name="ex">Exception to format.</param>
    /// <returns>The textual form of <paramref name="ex"/>.</returns>
    string ToString(Exception ex);
}
#endregion
#region
/// <summary>
/// Content-type error. Presumably raised when a downloaded resource has an
/// unexpected content type; the throw sites are not in this part of the file.
/// </summary>
public class ContentTypeException : Exception
{
    /// <summary>Creates the exception with the default message.</summary>
    public ContentTypeException() : base() { }

    /// <summary>Creates the exception with the given message.</summary>
    public ContentTypeException( string message ) : base( message ) { }

    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
    public ContentTypeException( string message, Exception innerException ) : base( message, innerException ) { }
}
/// <summary>
/// Content-size error. Presumably raised when a downloaded resource has an
/// unacceptable size; the throw sites are not in this part of the file.
/// </summary>
public class ContentSizeException : Exception
{
    /// <summary>Creates the exception with the default message.</summary>
    public ContentSizeException() : base() { }

    /// <summary>Creates the exception with the given message.</summary>
    public ContentSizeException( string message ) : base( message ) { }

    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
    public ContentSizeException( string message, Exception innerException ) : base( message, innerException ) { }
}
/// <summary>
/// "Not only" error. Presumably a uniqueness violation (more than one match
/// where exactly one was expected); the throw sites are not in this part of
/// the file, so confirm before relying on that reading.
/// </summary>
public class NotOnlyException : Exception
{
    /// <summary>Creates the exception with the default message.</summary>
    public NotOnlyException() : base() { }

    /// <summary>Creates the exception with the given message.</summary>
    public NotOnlyException( string message ) : base( message ) { }

    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
    public NotOnlyException( string message, Exception innerException ) : base( message, innerException ) { }
}
/// <summary>
/// Duplicate-key error. Presumably raised when a key being added already
/// exists; the throw sites are not in this part of the file.
/// </summary>
public class KeyHasExistsException : Exception
{
    /// <summary>Creates the exception with the default message.</summary>
    public KeyHasExistsException() : base() { }

    /// <summary>Creates the exception with the given message.</summary>
    public KeyHasExistsException( string message ) : base( message ) { }

    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
    public KeyHasExistsException( string message, Exception innerException ) : base( message, innerException ) { }
}
#endregion
#region PageInfo
/// <summary>
/// Blocking queue of pages, ordered by their <c>UrlMD5</c> key.
/// </summary>
/// <remarks>
/// The ordering lives in a per-instance linked list of keys; the PageInfo
/// payloads live in ONE static dictionary shared by every PageQueue instance.
/// Removing an entry from any queue therefore also removes its payload for
/// every other queue holding the same key, and those queues will then return
/// <c>null</c> for it (GetData logs that case).
/// Lock order is always key list first, then payload dictionary.
/// </remarks>
public class PageQueue
{
    /// <summary>Creates an empty queue.</summary>
    public PageQueue()
    {
        _queue = new LinkedList<string>();
    }

    /// <summary>Creates a queue over an existing key list; a null list yields an empty queue.</summary>
    /// <param name="queue">Key list to adopt (not copied).</param>
    public PageQueue( ref LinkedList<string> queue ) : this()
    {
        if( null != queue ){
            _queue = queue;
        }
    }

    #region Queue operations

    /// <summary>Number of keys currently queued.</summary>
    public int Count
    {
        get{
            // Read under the same lock the writers use.
            lock( _queue ){
                return _queue.Count;
            }
        }
    }

    /// <summary>Tells whether the page's key is queued in this instance.</summary>
    public bool Contains( PageInfo info )
    {
        if( null == info ){ throw new ArgumentNullException( "info" ); }
        lock( _queue ){
            return _queue.Contains( info.UrlMD5 );
        }
    }

    /// <summary>Appends a page (same as <see cref="AddLast"/>).</summary>
    public void Enqueue( PageInfo info )
    {
        AddLast( info );
    }

    /// <summary>Removes and returns the first page, blocking while the queue is empty (same as <see cref="RemoveFirst"/>).</summary>
    public PageInfo Dequeue()
    {
        return RemoveFirst();
    }

    /// <summary>Inserts a page at the head and wakes one waiting consumer.</summary>
    public void AddFirst( PageInfo info )
    {
        if( null == info ){ throw new ArgumentNullException( "info" ); }
        lock( _queue ){
            _queue.AddFirst( info.UrlMD5 );
            AddData( info );
            Monitor.Pulse( _queue );
        }
    }

    /// <summary>Appends a page at the tail and wakes one waiting consumer.</summary>
    public void AddLast( PageInfo info )
    {
        if( null == info ){ throw new ArgumentNullException( "info" ); }
        lock( _queue ){
            _queue.AddLast( info.UrlMD5 );
            AddData( info );
            Monitor.Pulse( _queue );
        }
    }

    /// <summary>
    /// Removes and returns the first page, blocking while the queue is empty.
    /// </summary>
    /// <returns>The page, or null when its payload is no longer in the shared dictionary.</returns>
    public PageInfo RemoveFirst()
    {
        lock( _queue ){
            // Re-check after every wake-up: another consumer may have taken
            // the item between the Pulse and this thread re-acquiring the lock.
            while( null == _queue.First ){
                Monitor.Wait( _queue );
            }
            string key = _queue.First.Value;
            _queue.RemoveFirst();
            PageInfo info = GetData( key );
            RemoveData( key );
            return info;
        }
    }

    /// <summary>
    /// Removes and returns the last page, blocking while the queue is empty.
    /// </summary>
    /// <returns>The page, or null when its payload is no longer in the shared dictionary.</returns>
    public PageInfo RemoveLast()
    {
        lock( _queue ){
            while( null == _queue.Last ){
                Monitor.Wait( _queue );
            }
            // Take the tail node (the previous code took the head here).
            string key = _queue.Last.Value;
            _queue.RemoveLast();
            PageInfo info = GetData( key );
            RemoveData( key );
            return info;
        }
    }

    /// <summary>Removes a specific page from the queue without blocking.</summary>
    /// <returns>The stored page, or null when the key was not queued or its payload is gone.</returns>
    public PageInfo Remove( PageInfo info )
    {
        if( null == info ){ throw new ArgumentNullException( "info" ); }
        string key = info.UrlMD5;
        lock( _queue ){
            if( !_queue.Remove( key ) ){
                return null;
            }
            PageInfo stored = GetData( key );
            RemoveData( key );
            return stored;
        }
    }

    /// <summary>Snapshots the queued pages into a key-to-page dictionary (values may be null when a payload is gone).</summary>
    public Dictionary<string, PageInfo> ToDictionary()
    {
        Dictionary<string, PageInfo> dict = new Dictionary<string, PageInfo>();
        lock( _queue ){
            for( LinkedListNode<string> node = _queue.First; null != node; node = node.Next ){
                dict[node.Value] = GetData( node.Value );
            }
        }
        return dict;
    }

    #endregion

    #region Shared payload store

    /// <summary>Looks up a payload by key; logs and returns null when it is missing.</summary>
    public PageInfo GetData( string key )
    {
        lock( _s_pages ){
            PageInfo info;
            if( _s_pages.TryGetValue( key, out info ) ){
                return info;
            }
            _log.Enqueue( string.Format( "wse.spider.cs GetData,Dictionary {0} ",key) );
            return null;
        }
    }

    /// <summary>Stores (or overwrites) the payload under its UrlMD5 key.</summary>
    public void AddData( PageInfo info )
    {
        lock( _s_pages ){
            _s_pages[info.UrlMD5] = info;
        }
    }

    /// <summary>Removes the payload for a key, if present.</summary>
    public void RemoveData( string key )
    {
        lock( _s_pages ){
            _s_pages.Remove( key ); // Remove is a no-op for a missing key
        }
    }

    /// <summary>Tells whether a payload for the page's key exists in the shared store.</summary>
    public bool ContainsData( PageInfo info )
    {
        lock( _s_pages ){
            return _s_pages.ContainsKey( info.UrlMD5 );
        }
    }

    #endregion

    #region Private Members
    private LinkedList<string> _queue = null;                      // ordered UrlMD5 keys; also the monitor object
    private static readonly Dictionary<string, PageInfo> _s_pages  // payloads, shared by ALL queues
        = new Dictionary<string, PageInfo>();
    private EventLogger _log = new EventLogger();
    #endregion
}
#endregion
#region
public class PageGatherer : IThread
{
#region
// 1
/// <summary>
/// Creates a gatherer with no downloader attached. Chains to the main
/// constructor so that the queue, thread list and pause event are initialised
/// (previously every field stayed null and AddThread/ThreadRun failed).
/// </summary>
public PageGatherer() : this( null ) {}

/// <summary>Creates a gatherer that downloads pages through <paramref name="gather"/>.</summary>
/// <param name="gather">Downloader used by the worker threads.</param>
public PageGatherer( IGatherer gather )
{
    _log = new EventLogger();
    _store = new PageStorage();                 // no IStorage attached; ThreadRun only uses its Queue property
    _gather = gather;
    _queue = new PageQueue();                   // work queue of this instance
    _threads = new List<Thread>();              // worker threads started by AddThread
    _shouldPause = new ManualResetEvent(true);  // signalled: workers are allowed to run
    _shouldStop = false;
}
#endregion
#region Public Property
//
// Exposes the static _s_indexedPool dictionary, so every PageGatherer instance
// returns the same object.
// NOTE(review): the Dictionary type arguments are missing from this listing.
// ThreadRun records info.UrlMD5 through AddIndexed, so the key is presumably
// the URL MD5 string; the value type is not visible here. Confirm against the
// complete source.
public Dictionary IndexedPool
{
get{ return _s_indexedPool; }
}
/// <summary>Seed queue. Backed by a static field, so it is shared by all PageGatherer instances.</summary>
public PageQueue SeedQueue
{
    get { return _s_seedQueue; }
}

/// <summary>Work queue owned by this instance.</summary>
public PageQueue Queue
{
    get { return _queue; }
}

/// <summary>Worker threads started through AddThread on this instance.</summary>
public List<Thread> Threads
{
    get { return _threads; }
}

/// <summary>Number of threads started so far. Backed by a static counter, so it counts across all instances.</summary>
public int ThreadCount
{
    get { return _threadCount; }
}
#endregion
#region (Thread Method)
//
/// <summary>Starts one more background worker running ThreadRun and registers it.</summary>
/// <returns>The started thread.</returns>
public Thread AddThread()
{
    Thread t = new Thread( new ThreadStart(ThreadRun) );
    t.IsBackground = true;
    t.Start();
    _threads.Add(t);
    // _threadCount is static (shared by every instance): bump it atomically.
    Interlocked.Increment( ref _threadCount );
    return t;
}

/// <summary>Not implemented.</summary>
public void RemoveThread()
{
}

/// <summary>Pauses the workers; same as <c>RequestThreadPause(true)</c>. (This overload used to do nothing.)</summary>
public void RequestThreadPause()
{
    RequestThreadPause( true );
}

/// <summary>Pauses or resumes the workers.</summary>
/// <param name="pauseOrContinue"><c>true</c> to pause, <c>false</c> to continue.</param>
public void RequestThreadPause( bool pauseOrContinue )
{
    if( pauseOrContinue ){
        _shouldPause.Reset();   // workers block at the top of their loop
    }else{
        _shouldPause.Set();     // workers run again
    }
}

/// <summary>Asks the workers to leave their loop.</summary>
public void RequestThreadStop()
{
    _shouldStop = true;
    // Release workers that are currently paused; otherwise they stay blocked
    // on the event and never observe the stop flag.
    _shouldPause.Set();
}
#endregion
#region Private Methods
//
/// <summary>
/// Worker loop: take a page from the work queue (refilling it from the seed
/// queue when empty), download it, record it as indexed, hand it to the
/// storage queue, then queue the URLs found in it.
/// </summary>
private void ThreadRun()
{
    // Nothing can be downloaded without a downloader (same early exit the
    // PageStorage and EventLogger workers use for their missing dependency).
    if( null == _gather ){ return; }

    while( !_shouldStop )
    {
        _shouldPause.WaitOne();             // blocks while paused
        if( _shouldStop ){ break; }         // stop may have been requested during the pause

        if( _queue.Count < 1 ){
            // Work queue is empty: pull the next seed (blocks until one exists).
            PageInfo seed = _s_seedQueue.Dequeue();
            if( null == seed ){ continue; } // payload already consumed elsewhere
            _queue.Enqueue( seed );
        }

        PageInfo info = _queue.Dequeue();
        if( null == info ){ continue; }

        // Remember the URL now: Download takes info by reference and may
        // replace it, so it cannot be trusted inside the catch block.
        string url = info.URL;

        // 1. download
        try{
            _gather.Download(ref info,"text/html",90000);
        }
        catch( Exception ex ){
            _log.Enqueue( url + " " + ex.ToString() );
            continue;
        }
        if( null == info ){ continue; }     // the downloader cleared the reference

        // 2. record the URL in the indexed pool
        AddIndexed( info.UrlMD5 );
        // 3. hand the page to the storage queue
        _store.Queue.Enqueue( info );
        // 4. queue the URLs found in the page
        AnalyzeToQueue( info, ref _queue );
    }
}
// url,
// Extracts the URLs of a downloaded page via Analyzer.ParseToURLs; presumably
// the new ones are then enqueued on the given queue, but the loop body is
// missing from this listing, so confirm against the complete source.
private void AnalyzeToQueue( PageInfo info, ref PageQueue queue )
{
PageQueue _queue = queue;
List urls = Analyzer.ParseToURLs(info);
PageInfo newInfo = null;
// NOTE(review): the next line is truncated. Everything between the '<' of the
// loop condition and the '>' of the _threads field declaration was lost
// (apparently swallowed as an HTML tag). Not visible as a result: the rest of
// this loop, the AddIndexed method that ThreadRun calls, and the declarations
// of _log, _store, _gather and _queue that the constructor assigns. Restore
// them from the original download before compiling.
for( int i=0,len=urls.Count; i _threads; //
private ManualResetEvent _shouldPause; // signalled = workers run, reset = workers block at the top of ThreadRun
private bool _shouldStop; // set by RequestThreadStop, polled by the ThreadRun loop; NOTE(review): not volatile
private static Dictionary _s_indexedPool = new Dictionary(); // pool of already indexed URLs (static: shared by all instances); type arguments missing from this listing
private static PageQueue _s_seedQueue = new PageQueue(); // seed queue (static: shared by all instances)
private static int _threadCount = 0; // incremented by AddThread (static: counts across all instances)
#endregion
}
#endregion
#region
public class PageStorage : IThread
{
#region
// 1
/// <summary>
/// Creates a storage worker without a backing store. ThreadRun exits
/// immediately for such an instance.
/// </summary>
public PageStorage()
{
}

/// <summary>Creates a storage worker that persists pages through <paramref name="store"/>.</summary>
/// <param name="store">Backing store that receives the dequeued batches.</param>
public PageStorage( IStorage store )
{
    _shouldStop = false;
    _store = store;
    _log = new EventLogger();
}
#endregion
#region Public Property
//
/// <summary>Queue of pages waiting to be stored (returns the _s_queue field).</summary>
public PageQueue Queue
{
    get { return _s_queue; }
}

/// <summary>Thread list of this instance.</summary>
public List<Thread> Threads
{
    get { return _threads; }
}
#endregion
#region (Thread Method)
//
/// <summary>Starts one more background worker running ThreadRun and registers it.</summary>
/// <returns>The started thread.</returns>
public Thread AddThread()
{
    Thread t = new Thread( new ThreadStart(ThreadRun) );
    t.IsBackground = true;
    t.Start();
    // Register the worker, as PageGatherer and EventLogger do; without this
    // the Threads property never reports any thread.
    _threads.Add(t);
    return t;
}

/// <summary>Not implemented.</summary>
public void RemoveThread()
{
}

/// <summary>Not implemented: this worker has no pause mechanism.</summary>
public void RequestThreadPause()
{
}

/// <summary>Not implemented: this worker has no pause mechanism.</summary>
/// <param name="pauseOrContinue">Ignored.</param>
public void RequestThreadPause( bool pauseOrContinue )
{
}

/// <summary>Asks the workers to leave their loop; the flag is checked once per batch.</summary>
public void RequestThreadStop()
{
    _shouldStop = true;
}
#endregion
#region Private Methods
//
/// <summary>
/// Worker loop: repeatedly takes a batch of up to 10 pages from the queue and
/// passes it to the store. Storage errors are logged and the loop goes on.
/// Exits immediately when no store is attached.
/// </summary>
private void ThreadRun()
{
    if( null == _store ){ return; }

    const int batchSize = 10;
    while( !_shouldStop )
    {
        List<PageInfo> infos = DequeueSome( batchSize );
        // Nothing to persist: do not bother the store with an empty batch.
        if( null == infos || 0 == infos.Count ){ continue; }

        try{
            _store.SaveContents( infos );
        }
        catch( Exception ex ){
            _log.Enqueue( ex.ToString() );
        }
    }
}
//
// Builds the batch of pages that ThreadRun passes to the store (ThreadRun asks
// for 10 at a time).
private List DequeueSome( int count )
{
List infos = new List();
// NOTE(review): the next line is truncated. Everything between the '<' of the
// loop condition and the '>' of the _threads field declaration was lost
// (apparently swallowed as an HTML tag). Not visible as a result: the rest of
// this method and the declarations of _log, _store and _s_queue used elsewhere
// in the class. The List type arguments are missing as well. Restore them from
// the original download before compiling.
for( int i=0; i _threads = new List(); //
private bool _shouldStop; // set by RequestThreadStop, polled by the ThreadRun loop
#endregion
}
#endregion
#region
/// <summary>
/// Asynchronous logger: callers enqueue messages, worker threads started with
/// <see cref="AddThread"/> hand them to an <see cref="ILogger"/>.
/// </summary>
/// <remarks>
/// The message queue is static, so every EventLogger instance feeds the same
/// queue. Instances created with the parameterless constructor have no
/// ILogger: they can enqueue, but their worker and self-check threads exit
/// immediately.
/// </remarks>
public class EventLogger : IThread
{
    /// <summary>Creates an enqueue-only logger (no ILogger attached).</summary>
    public EventLogger(){}

    /// <summary>Creates a logger whose workers write through <paramref name="logger"/>.</summary>
    /// <param name="logger">Sink that receives the dequeued messages.</param>
    public EventLogger( ILogger logger )
    {
        _logger = logger;
        _shouldStop = false;
        _selfCheckInterval = 300000; // 5 minutes, in milliseconds (passed to Thread.Sleep)
    }

    #region Public Properties

    /// <summary>The shared (static) message queue.</summary>
    public Queue<string> Queue
    {
        get{ return _s_queue; }
    }

    /// <summary>Worker threads started through AddThread on this instance.</summary>
    public List<Thread> Threads
    {
        get{ return _threads; }
    }

    #endregion

    #region Queue methods

    /// <summary>Queues a message and wakes one waiting worker.</summary>
    public void Enqueue( string s )
    {
        lock( _s_queue ){
            _s_queue.Enqueue( s );
            Monitor.Pulse( _s_queue );
        }
    }

    /// <summary>Removes and returns the oldest message, blocking while the queue is empty.</summary>
    public string Dequeue()
    {
        lock( _s_queue )
        {
            // Re-check after every wake-up: with several workers another one
            // may have taken the message first, and dequeuing from an empty
            // queue throws InvalidOperationException.
            while( 1 > _s_queue.Count ){
                Monitor.Wait( _s_queue );
            }
            return _s_queue.Dequeue();
        }
    }

    #endregion

    #region Thread methods

    /// <summary>Starts one more background worker running ThreadRun and registers it.</summary>
    /// <returns>The started thread.</returns>
    public Thread AddThread()
    {
        Thread t = new Thread( new ThreadStart(ThreadRun) );
        t.IsBackground = true;
        t.Start();
        _threads.Add(t);
        return t;
    }

    /// <summary>Not implemented.</summary>
    public void RemoveThread()
    {
    }

    /// <summary>Not implemented: this worker has no pause mechanism.</summary>
    public void RequestThreadPause()
    {
    }

    /// <summary>Not implemented: this worker has no pause mechanism.</summary>
    /// <param name="pauseOrContinue">Ignored.</param>
    public void RequestThreadPause( bool pauseOrContinue )
    {
    }

    /// <summary>
    /// Asks this instance's threads to leave their loops. A worker blocked in
    /// Dequeue only notices after it has processed one more message.
    /// </summary>
    public void RequestThreadStop()
    {
        _shouldStop = true;
    }

    /// <summary>Starts the self-check thread, at most once per process.</summary>
    public void AddSelfCheckThread()
    {
        // Check and set under one lock so that two concurrent callers cannot
        // both start a self-check thread.
        lock( _s_selfCheckGate ){
            if( _isSelfCheckRun ){ return; }
            Thread t = new Thread( new ThreadStart(SelfCheck) );
            t.IsBackground = true;
            t.Start();
            _isSelfCheckRun = true;
        }
    }

    #endregion

    #region Private Methods

    // Worker loop: writes queued messages to the ILogger until stop is requested.
    private void ThreadRun()
    {
        if( null == _logger ){ return; }
        while( !_shouldStop )
        {
            try{
                _logger.Write( Dequeue() );
            }
            catch( Exception ex ){
                Console.WriteLine( string.Format( " : {0}",ex.ToString() ) );
            }
        }
    }

    // Self-check loop: writes a marker entry, then sleeps for the self-check interval.
    private void SelfCheck()
    {
        if( null == _logger ){ return; }
        while( !_shouldStop )
        {
            try{
                _logger.Write( " " );
                Thread.Sleep( _selfCheckInterval );
            }
            catch( Exception ex ){
                Console.WriteLine( string.Format( " : {0}",ex.ToString() ) );
            }
        }
    }

    #endregion

    #region Private Members
    private ILogger _logger = null;                                           // sink; null for enqueue-only instances
    private static readonly Queue<string> _s_queue = new Queue<string>();     // message queue shared by all instances
    private static readonly object _s_selfCheckGate = new object();           // guards _isSelfCheckRun
    private List<Thread> _threads = new List<Thread>();                       // workers of this instance
    private volatile bool _shouldStop;                                        // written by one thread, polled by the workers
    private int _selfCheckInterval;                                           // milliseconds between self-check entries
    private static bool _isSelfCheckRun = false;                              // true once the self-check thread was started
    #endregion
}
#endregion
} // end namespace My.WSE