C#マルチスレッドウェブ採集器(Spider)


これはC#言語で書かれた、マルチスレッド対応のウェブページ自動収集プログラムです。メインクラスのコードを以下に示します。完全なコードはこちらからダウンロードしてください。
転載元:http://www.cnblogs.com/closetome/articles/1711764.html
/**
        :
--------------------------------------------
  ->    ->         ->    ->  ->  
              (   )                       
     sql        (    )                       
      c#  



  UC1:     
----------------------------------------------
  : WSE  
  :     
     :    
       :
  ——   :       ,    URL,  ,    URL,      ,    URL   
  ——  :                 
    :             
    (     ):       ,       URL
     (     ):
  1.          URL,      (  ,    ,    ),      
  2.        
  3.        (     )
  4.         URL(     )
  5.        URL  (     )



PageInfo    
----------------------------
           | id
          | createdTime
          | modifiedTime
           | createdUser
           | modifiedUser
              |
            | URL
  128 MD5  | UrlMD5
IP          | IP
          | content
          | type
              |


     
-------------------------------

    :       UrlMD5
|-------------------------------------------------------------------------------------------------------|
|Spider                                                                                                |
|         |---------------------------------------------------------------------|    |----------------| |
|  |--|   |    (  )                         ------------------------------|--->|    UrlMD5  | |
|  | |   |      |---------|    |--------|    |---|------|     |--------|       |    |----------------| |
|  | |---|---|->| URL   |--->|     |--->|   |  |   |---->|     |---|   |                       |
|  | |   |   |  |---------|    |--------|    |------|---|     |--------|   |   |                       |
|  | |   |   |                                      |                      |   |                       |
|  |--|   |   |--------------------------------------|----------------------|   |    |----------------| |
|    |---|         | |
|    |    |---------------------------------------------------------------------|    |--------|-------| |
|    |                                                                                        |         |
|    |    |---------|                                                                         |         |
|    |---|     |                                                                                   |
|         |---------|                                                                                   |
|                                                                                                       |
|-------------------------------------------------------------------------------------------------------|


   
-------------------------------
    
  SpiderHandler --      

     
  Spider       
  PageInfo     --    
  Gatherer     --     
  Analyser     --url   

    
  IStorage         
  ISpiderUI      
  ILogger        

*/

//==================================================================
//
//           C#                    。
//          ,   ,    。
//
//    Copyright(C) 2009 themz.cn All rights reserved
//    author:   xml
//    email:    [email protected]
//    blog:     http://programmingcanruinyourlife.themz.cn/
//    since:    .net2.0
//    version:  1.0
//    created:  2009-08-06
//    modified: 2009-10-10
//
//         :          ,                 。
//                                     。
//                                        ,         。
//                      !
//
//==================================================================
//          :
//         1.           .cs      
//         2.  .net2.0          exe   ,    
//         3.   addSeeds        ,   : addSeeds http://url/
//         4.   start       
//         5.      getContents            
//         6. pause        , start     
//         7. stop       
//         8. exit        
//
//

using System;
using System.Collections.Generic;
using System.Data;
using System.Data.Common;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

//using System.Configuration;
//using System.Diagnostics;
//[Serializable()]
namespace My.WSE.Spider
{
  #region       
    /**
         

          

          
          

          
          

      /      
               
               
    */
    /// <summary>
    /// Thread-management contract implemented by PageGatherer, PageStorage and
    /// EventLogger in this file.
    /// </summary>
    public interface IThread
    {
        // Member sketches left disabled in the original listing:
        //T Queue{ get; }
        //List Threads{ get; }
        //
        //void Enqueue( T t );
        //T Dequeue();

        /// <summary>
        /// Adds a worker thread and returns it. The implementations in this file
        /// create a background thread and start it before returning.
        /// </summary>
        Thread AddThread();

        /// <summary>
        /// Removes a worker thread. Every implementation visible in this file
        /// has an empty body.
        /// </summary>
        void RemoveThread();

        /// <summary>
        /// Requests a pause. Every implementation visible in this file has an
        /// empty body for this overload.
        /// </summary>
        void RequestThreadPause();

        /// <summary>Requests that workers pause or continue.</summary>
        /// <param name="pauseOrContinue">
        /// In PageGatherer, true blocks the workers and false releases them;
        /// PageStorage and EventLogger ignore the argument.
        /// </param>
        void RequestThreadPause( bool pauseOrContinue );

        /// <summary>
        /// Requests a stop. The implementations in this file set a flag that
        /// their worker loops test at the top of each iteration.
        /// </summary>
        void RequestThreadStop();
    }
  #endregion

  #region      
    //     
    /// <summary>
    /// Page downloader used by PageGatherer's worker loop.
    /// </summary>
    public interface IGatherer
    {
        /// <summary>
        /// Downloads the page described by <paramref name="info"/>. The page is
        /// passed by reference, so an implementation may replace the object.
        /// PageGatherer calls this overload with "text/html" and 90000.
        /// </summary>
        /// <param name="info">Page to download.</param>
        /// <param name="contentType">Content type passed by the caller.</param>
        /// <param name="timeout">
        /// NOTE(review): unit is not shown in this file — presumably
        /// milliseconds; confirm against the implementation.
        /// </param>
        void Download( ref PageInfo info,string contentType,int timeout );

        /// <summary>
        /// Overload without a content type. No call site is visible in this file.
        /// </summary>
        void Download( ref PageInfo info,int timeout );
    }

    //     
    /// <summary>
    /// Persistence contract used by PageStorage.
    /// NOTE(review): the generic type arguments of the List declarations below
    /// appear to have been stripped from this listing; the element types given
    /// in the comments are assumptions to confirm against the real source.
    /// </summary>
    public interface IStorage
    {
        // Returns the already-indexed entries.
        // NOTE(review): presumably a list of URL MD5 strings — confirm.
        List GetIndexeds();

        // Seed management.
        // NOTE(review): GetSeeds presumably returns SeedInfo items — confirm.
        List GetSeeds();
        int AddSeed( SeedInfo info );
        void RemoveSeed( SeedInfo info );

        // Saves a batch; PageStorage.ThreadRun passes the result of DequeueSome.
        // NOTE(review): presumably a list of PageInfo — confirm.
        void SaveContents( List info );
    }

    //     
    /// <summary>
    /// Log sink used by EventLogger, which calls Write for each queued message
    /// and for its periodic self-check entry.
    /// </summary>
    public interface ILogger
    {
        /// <summary>Writes one log entry.</summary>
        void Write( string content );

        /// <summary>
        /// Reads a log by file name. No call site is visible in this file.
        /// </summary>
        string Read( string filename );

        /// <summary>
        /// Formats an exception as text. No call site is visible in this file.
        /// </summary>
        string ToString( Exception ex );
    }

  #endregion

  #region    
    /// <summary>
    /// Content-type error. No throw site is visible in this file; presumably
    /// raised when a downloaded resource has an unexpected content type.
    /// Provides the standard exception constructors so it can be created
    /// without a message or wrap an inner exception.
    /// </summary>
    public class ContentTypeException : Exception
    {
        public ContentTypeException() : base(){}
        public ContentTypeException( string message ) : base( message ){}
        public ContentTypeException( string message, Exception innerException ) : base( message, innerException ){}
    }

    /// <summary>
    /// Content-size error. No throw site is visible in this file; presumably
    /// raised when a downloaded resource is outside the accepted size.
    /// Provides the standard exception constructors so it can be created
    /// without a message or wrap an inner exception.
    /// </summary>
    public class ContentSizeException : Exception
    {
        public ContentSizeException() : base(){}
        public ContentSizeException( string message ) : base( message ){}
        public ContentSizeException( string message, Exception innerException ) : base( message, innerException ){}
    }

    /// <summary>
    /// Custom exception; no throw site is visible in this file.
    /// NOTE(review): the name suggests a "not unique" condition — confirm.
    /// Provides the standard exception constructors so it can be created
    /// without a message or wrap an inner exception.
    /// </summary>
    public class NotOnlyException : Exception
    {
        public NotOnlyException() : base(){}
        public NotOnlyException( string message ) : base( message ){}
        public NotOnlyException( string message, Exception innerException ) : base( message, innerException ){}
    }

    /// <summary>
    /// Custom exception; no throw site is visible in this file.
    /// NOTE(review): the name suggests a duplicate-key condition — confirm.
    /// Provides the standard exception constructors so it can be created
    /// without a message or wrap an inner exception.
    /// </summary>
    public class KeyHasExistsException : Exception
    {
        public KeyHasExistsException() : base(){}
        public KeyHasExistsException( string message ) : base( message ){}
        public KeyHasExistsException( string message, Exception innerException ) : base( message, innerException ){}
    }
  #endregion

  #region PageInfo  
    /// <summary>
    /// Blocking queue of PageInfo items.
    /// The per-instance linked list stores only UrlMD5 keys; the PageInfo
    /// objects live in one static dictionary shared by every PageQueue, so
    /// removing a page through one queue removes its data for all queues.
    /// Producers pulse the list's monitor; consumers wait on it while empty.
    /// </summary>
    public class PageQueue
    {
        /// <summary>Creates a queue with an empty key list.</summary>
        public PageQueue()
        {
            _queue = new LinkedList<string>();
        }

        /// <summary>
        /// Creates a queue over an existing key list; a null argument keeps the
        /// empty list created by the parameterless constructor.
        /// </summary>
        public PageQueue( ref LinkedList<string> queue ) : this()
        {
            if( null != queue ){
                _queue = queue;
            }
        }


      #region Queue operations
        /// <summary>Number of keys currently queued.</summary>
        public int Count
        {
            get{  return _queue.Count;  }
        }

        /// <summary>True when the page's UrlMD5 is in this queue.</summary>
        public bool Contains( PageInfo info )
        {
            // Walking the list while another thread mutates it is unsafe,
            // so take the same lock the mutators use.
            lock( _queue ){
                return _queue.Contains( info.UrlMD5 );
            }
        }

        /// <summary>Alias for AddLast.</summary>
        public void Enqueue( PageInfo info )
        {
            AddLast( info );
        }

        /// <summary>Alias for RemoveFirst; blocks while the queue is empty.</summary>
        public PageInfo Dequeue()
        {
            return RemoveFirst();
        }

        /// <summary>Inserts the page at the head and wakes one waiting consumer.</summary>
        public void AddFirst( PageInfo info )
        {
            lock( _queue ){
                _queue.AddFirst( info.UrlMD5 );
                AddData( info );
                Monitor.Pulse( _queue );
            }
        }

        /// <summary>Appends the page at the tail and wakes one waiting consumer.</summary>
        public void AddLast( PageInfo info )
        {
            lock( _queue ){
                _queue.AddLast( info.UrlMD5 );
                AddData( info );
                Monitor.Pulse( _queue );
            }
        }

        /// <summary>
        /// Removes and returns the head page, blocking while the queue is empty.
        /// Returns null when the key's data is no longer in the shared store.
        /// </summary>
        public PageInfo RemoveFirst()
        {
            lock( _queue ){
                // Re-test after every wake-up: another consumer may have taken
                // the item that triggered the pulse.
                while( null == _queue.First ){
                    Monitor.Wait( _queue );
                }

                string key = _queue.First.Value;
                _queue.RemoveFirst();
                return TakeData( key );
            }
        }

        /// <summary>
        /// Removes and returns the tail page, blocking while the queue is empty.
        /// Returns null when the key's data is no longer in the shared store.
        /// </summary>
        public PageInfo RemoveLast()
        {
            lock( _queue ){
                while( null == _queue.Last ){
                    Monitor.Wait( _queue );
                }

                string key = _queue.Last.Value;
                _queue.RemoveLast();
                return TakeData( key );
            }
        }

        /// <summary>
        /// Removes the given page's key from this queue and returns the stored
        /// PageInfo, or null when the key was not queued (or its data is gone).
        /// </summary>
        public PageInfo Remove( PageInfo info )
        {
            // Capture the key first: the stored object may be missing, and the
            // key is still needed to clean up afterwards.
            string key = info.UrlMD5;
            lock( _queue ){
                if( !_queue.Remove( key ) ){
                    return null;
                }
                return TakeData( key );
            }
        }

        /// <summary>Snapshot of the queued keys mapped to their stored pages.</summary>
        public Dictionary<string, PageInfo> ToDictionary()
        {
            Dictionary<string, PageInfo> dict = new Dictionary<string, PageInfo>();

            lock( _queue ){
                LinkedListNode<string> node = _queue.First;
                while( null != node ){
                    dict[node.Value] = GetData(node.Value);
                    node = node.Next;
                }
            }
            return dict;
        }
      #endregion

      #region Shared page store
        /// <summary>
        /// Looks up a page by key in the shared store; logs and returns null
        /// when the key is absent.
        /// </summary>
        public PageInfo GetData( string key )
        {
            PageInfo found;
            lock( _s_pages ){
                if( _s_pages.TryGetValue( key, out found ) ){
                    return found;
                }
            }
            _log.Enqueue( string.Format( "wse.spider.cs GetData,Dictionary {0}    ",key) );
            return null;
        }

        /// <summary>Stores (or overwrites) the page under its UrlMD5.</summary>
        public void AddData( PageInfo info )
        {
            lock( _s_pages ){
                _s_pages[info.UrlMD5] = info;
            }
        }

        /// <summary>Drops the page stored under the key, if any.</summary>
        public void RemoveData( string key )
        {
            lock( _s_pages ){
                _s_pages.Remove(key);   // Remove is a no-op for a missing key
            }
        }

        /// <summary>True when the shared store holds data for the page's UrlMD5.</summary>
        public bool ContainsData( PageInfo info )
        {
            lock( _s_pages ){
                return _s_pages.ContainsKey(info.UrlMD5);
            }
        }
      #endregion

      #region Private Members
        // Fetches the page for a key and drops it from the shared store.
        private PageInfo TakeData( string key )
        {
            PageInfo info = GetData(key);
            RemoveData(key);
            return info;
        }

        // Keys (UrlMD5) in queue order; also the monitor producers and consumers use.
        private LinkedList<string> _queue = null;
        // UrlMD5 -> PageInfo, shared by every PageQueue instance.
        private static readonly Dictionary<string, PageInfo> _s_pages = new Dictionary<string, PageInfo>();

        private EventLogger _log = new EventLogger();
      #endregion
    }
  #endregion

  #region      
    public class PageGatherer : IThread
    {
      #region     
        //     1
        /// <summary>
        /// Parameterless constructor with an empty body: it assigns none of the
        /// fields that the IGatherer constructor initialises.
        /// </summary>
        public PageGatherer(){}

        //     2
        /// <summary>
        /// Creates a gatherer that downloads pages through <paramref name="gather"/>.
        /// </summary>
        /// <param name="gather">Downloader called by the worker loop.</param>
        public PageGatherer( IGatherer gather )
        {
            _log = new EventLogger();
            // Built with the parameterless constructor; the worker loop only
            // uses its Queue property.
            _store = new PageStorage();

            _gather = gather;
            _queue = new PageQueue();          // this gatherer's working queue
            _threads = new List<Thread>();     // threads started by AddThread

            _shouldPause = new ManualResetEvent(true);  // signalled: workers run
            _shouldStop = false;
        }
      #endregion

      #region Public Property
        //       
        /// <summary>
        /// Returns the static pool of indexed URLs shared by all gatherers.
        /// NOTE(review): the Dictionary's type arguments are missing from this
        /// listing and cannot be determined from the visible code — restore
        /// them from the original source.
        /// </summary>
        public Dictionary IndexedPool
        {
            get{ return _s_indexedPool; }
        }
        /// <summary>Returns the static seed queue shared by all gatherers.</summary>
        public PageQueue SeedQueue
        {
            get
            {
                return _s_seedQueue;
            }
        }

        //       
        /// <summary>Returns this gatherer's own working queue.</summary>
        public PageQueue Queue
        {
            get
            {
                return _queue;
            }
        }
        /// <summary>
        /// Returns the list of worker threads that AddThread has started for
        /// this gatherer.
        /// </summary>
        public List<Thread> Threads
        {
            get{  return _threads;  }
        }
        //     
        /// <summary>
        /// Returns the static counter that AddThread increments for every
        /// thread it starts.
        /// </summary>
        public int ThreadCount
        {
            get
            {
                return _threadCount;
            }
        }
      #endregion

      #region     (Thread Method)
        //     
        /// <summary>
        /// Starts one background worker running ThreadRun, records it in the
        /// thread list, bumps the static thread counter and returns the thread.
        /// </summary>
        public Thread AddThread()
        {
            ThreadStart entryPoint = new ThreadStart(ThreadRun);
            Thread worker = new Thread( entryPoint );
            worker.IsBackground = true;
            worker.Start();

            _threads.Add( worker );
            _threadCount += 1;
            return worker;
        }
        //     
        /// <summary>Removes a worker thread. The body is empty: calling it does nothing.</summary>
        public void RemoveThread()
        {
            // No-op: nothing is implemented here.
        }
        //       
        /// <summary>
        /// Parameterless pause request. The body is empty, so calling it does
        /// not affect the workers; only the bool overload changes the pause state.
        /// </summary>
        public void RequestThreadPause()
        {
            // No-op: nothing is implemented here.
        }
        //       
        /// <summary>Pauses or resumes the workers via the pause event.</summary>
        /// <param name="pauseOrContinue">
        /// true resets the event so workers block at the top of their loop;
        /// false sets it so they continue.
        /// </param>
        public void RequestThreadPause( bool pauseOrContinue )
        {
            if( pauseOrContinue ){
                _shouldPause.Reset();
                return;
            }
            _shouldPause.Set();
        }
        //       
        /// <summary>
        /// Sets the stop flag. ThreadRun tests the flag at the top of each loop
        /// iteration, so a worker finishes its current iteration first.
        /// </summary>
        public void RequestThreadStop()
        {
            _shouldStop = true;
        }
      #endregion

      #region Private Methods
        //       
        /// <summary>
        /// Worker loop: take a page (pulling a seed when the working queue is
        /// empty), download it, mark it as indexed, hand it to the storage
        /// queue, then feed the URLs found in it back into the working queue.
        /// Runs until the stop flag is set.
        /// </summary>
        private void ThreadRun()
        {
            while( !_shouldStop )
            {
                _shouldPause.WaitOne();              // blocks while paused
                // A stop may have been requested while this worker was paused.
                if( _shouldStop ){  break;  }

                if( _queue.Count < 1 ){
                    // Working queue is empty: pull the next seed (blocks until one exists).
                    PageInfo seed = _s_seedQueue.Dequeue();
                    // Dequeue returns null when the seed's data is gone from the
                    // shared store; enqueuing null would dereference it inside the lock.
                    if( null == seed ){  continue;  }
                    _queue.Enqueue( seed );
                }

                PageInfo info = _queue.Dequeue();
                if( null == info ){  continue;  }

                // 1. Download. The URL is captured first because info is passed
                //    by reference and may not be usable after a failure.
                string url = info.URL;
                try{
                    _gather.Download(ref info,"text/html",90000);
                }
                catch( Exception ex ){
                    _log.Enqueue( url + " " + ex.ToString() );
                    continue;
                }

                // 2. Record the page's URL hash as indexed.
                AddIndexed( info.UrlMD5 );

                // 3. Queue the downloaded page for storage.
                _store.Queue.Enqueue( info );

                // 4. Analyse the page and queue what it yields.
                AnalyzeToQueue( info, ref _queue );
            }
        }
        //        url,         
        private void AnalyzeToQueue( PageInfo info, ref PageQueue queue )
        {
            PageQueue _queue = queue;

            List urls = Analyzer.ParseToURLs(info);
            PageInfo newInfo = null;

            for( int i=0,len=urls.Count; i _threads;          //      

        // NOTE(review): the listing is truncated just above this point; the
        // declarations of _log, _store, _gather, _queue and _threads are not
        // visible here.
        private ManualResetEvent _shouldPause;  // signalled => workers run; reset => workers block
        private bool _shouldStop;               // tested by ThreadRun at the top of each iteration

        // NOTE(review): type arguments were stripped from this listing; ThreadRun
        // passes a UrlMD5 to AddIndexed, but the key/value types are not shown.
        private static Dictionary _s_indexedPool = new Dictionary();      // indexed URLs, shared by all gatherers
        private static PageQueue _s_seedQueue = new PageQueue();   // seed queue, shared by all gatherers

        private static int _threadCount = 0;     // incremented by AddThread
      #endregion
    }
  #endregion

  #region      
    public class PageStorage  : IThread
    {
      #region     
        //     1
        /// <summary>
        /// Parameterless constructor with an empty body. It assigns no IStorage,
        /// and ThreadRun returns immediately when the store is null.
        /// </summary>
        public PageStorage(){}

        //     2
        /// <summary>
        /// Creates a storage worker that saves pages through <paramref name="store"/>.
        /// </summary>
        /// <param name="store">Backend that ThreadRun hands each batch to.</param>
        public PageStorage( IStorage store )
        {
            this._store = store;
            this._shouldStop = false;
            this._log = new EventLogger();
        }
      #endregion

      #region Public Property
        //     
        /// <summary>
        /// Returns the queue of pages waiting to be stored (the _s_queue field).
        /// </summary>
        public PageQueue Queue
        {
            get
            {
                return _s_queue;
            }
        }
        //       
        /// <summary>
        /// Returns the thread list field.
        /// NOTE(review): the List's type argument was stripped from this listing
        /// — presumably Thread, as in the sibling classes; confirm. AddThread in
        /// this class does not add to the list.
        /// </summary>
        public List Threads
        {
            get{ return _threads;  }
        }
      #endregion

      #region     (Thread Method)
        //     
        /// <summary>
        /// Starts one background worker running ThreadRun, records it in the
        /// thread list so the Threads property reflects it (matching the other
        /// IThread implementations in this file), and returns the thread.
        /// </summary>
        public Thread AddThread()
        {
            Thread t = new Thread( new ThreadStart(ThreadRun) );
            t.IsBackground = true;
            t.Start();
            _threads.Add(t);
            return t;
        }
        //     
        /// <summary>Removes a worker thread. The body is empty: calling it does nothing.</summary>
        public void RemoveThread()
        {
            // No-op: nothing is implemented here.
        }
        //       
        /// <summary>Pause request. The body is empty: calling it does nothing.</summary>
        public void RequestThreadPause()
        {
            // No-op: nothing is implemented here.
        }
        //       
        /// <summary>
        /// Pause/continue request. The body is empty, so the argument is ignored
        /// and calling it does nothing.
        /// </summary>
        public void RequestThreadPause( bool pauseOrContinue )
        {
            // No-op: nothing is implemented here.
        }
        //       
        /// <summary>
        /// Sets the stop flag, which ThreadRun tests at the top of each loop
        /// iteration.
        /// </summary>
        public void RequestThreadStop()
        {
            _shouldStop = true;
        }
      #endregion

      #region Private Methods
        //     
        /// <summary>
        /// Storage worker loop: repeatedly takes a batch via DequeueSome and
        /// passes it to the store, logging any exception the store throws.
        /// Returns at once when no store was supplied.
        /// </summary>
        private void ThreadRun()
        {
            if( null == _store ){ return;  }

            int count = 10;     // batch size requested from DequeueSome
            // NOTE(review): the List type argument was stripped from this
            // listing — presumably PageInfo; confirm against the original.
            List infos = null;

            while( !_shouldStop )
            {
                // Called outside the try block, so an exception thrown by
                // DequeueSome is not caught here.
                infos = DequeueSome( count );
                try{
                    _store.SaveContents( infos );
                }
                catch( Exception ex ){
                    _log.Enqueue( ex.ToString() );
                }
            }
        }
        //     
        private List DequeueSome( int count )
        {
            List infos = new List();

            for( int i=0; i _threads = new List();   //  

        private bool _shouldStop;
      #endregion
    }
  #endregion

  #region      
    /// <summary>
    /// Asynchronous log front end. Messages are pushed onto one static queue
    /// shared by every EventLogger; worker threads started with AddThread pop
    /// them and pass them to the ILogger. An optional self-check thread writes
    /// a periodic entry.
    /// </summary>
    public class EventLogger : IThread
    {
        /// <summary>
        /// Creates a logger without an ILogger. Such an instance can still
        /// enqueue messages, but its ThreadRun and SelfCheck return immediately.
        /// </summary>
        public EventLogger(){}

        /// <summary>Creates a logger that writes through <paramref name="logger"/>.</summary>
        public EventLogger( ILogger logger )
        {
            _logger = logger;
            _shouldStop = false;
            _selfCheckInterval = 300000;    // passed to Thread.Sleep: 300000 ms = 5 minutes
        }

      #region Public Properties
        /// <summary>The static message queue shared by all loggers.</summary>
        public Queue<string> Queue
        {
            get{  return _s_queue;  }
        }

        /// <summary>Worker threads started by AddThread on this instance.</summary>
        public List<Thread> Threads
        {
            get{  return _threads;  }
        }
      #endregion

      #region Queue Method
        /// <summary>Appends a message and wakes one waiting consumer.</summary>
        public void Enqueue( string s )
        {
            lock( _s_queue ){
                _s_queue.Enqueue( s );
                Monitor.Pulse( _s_queue );
            }
        }

        /// <summary>Removes and returns the oldest message, blocking while the queue is empty.</summary>
        public string Dequeue()
        {
            lock( _s_queue )
            {
                // Re-test after every wake-up: with several consumers another
                // thread may already have taken the message that was pulsed,
                // and Queue.Dequeue throws on an empty queue.
                while( _s_queue.Count < 1 ){
                    Monitor.Wait( _s_queue );
                }
                return _s_queue.Dequeue();
            }
        }
      #endregion

      #region Thread Method
        /// <summary>
        /// Starts one background worker running ThreadRun, records it in the
        /// thread list and returns it.
        /// </summary>
        public Thread AddThread()
        {
            Thread t = new Thread( new ThreadStart(ThreadRun) );
            t.IsBackground = true;
            t.Start();
            _threads.Add(t);
            return t;
        }

        /// <summary>Removes a worker thread. The body is empty: calling it does nothing.</summary>
        public void RemoveThread()
        {
            // No-op: nothing is implemented here.
        }

        /// <summary>Pause request. The body is empty: calling it does nothing.</summary>
        public void RequestThreadPause()
        {
            // No-op: nothing is implemented here.
        }

        /// <summary>Pause/continue request. The body is empty: the argument is ignored.</summary>
        public void RequestThreadPause( bool pauseOrContinue )
        {
            // No-op: nothing is implemented here.
        }

        /// <summary>
        /// Sets the stop flag tested by the worker and self-check loops. A
        /// worker blocked in Dequeue only sees it after the next message arrives.
        /// </summary>
        public void RequestThreadStop()
        {
            _shouldStop = true;
        }

        /// <summary>
        /// Starts the self-check thread unless the static flag says one has
        /// already been started (by any instance).
        /// </summary>
        public void AddSelfCheckThread()
        {
            if( !_isSelfCheckRun ){
                Thread t = new Thread( new ThreadStart(SelfCheck) );
                t.IsBackground = true;
                t.Start();
                _isSelfCheckRun = true;
            }
        }
      #endregion

      #region Private Methods
        // Worker loop: writes each dequeued message through the ILogger;
        // failures are reported on the console and the loop continues.
        private void ThreadRun()
        {
            if( null == _logger ){ return;  }

            while( !_shouldStop )
            {
                try{
                    _logger.Write( Dequeue() );
                }
                catch( Exception ex ){
                    Console.WriteLine( string.Format( "  :        {0}",ex.ToString() ) );
                }
            }
        }

        // Self-check loop: writes a periodic entry through the ILogger.
        private void SelfCheck()
        {
            if( null == _logger ){ return;  }

            while( !_shouldStop )
            {
                try{
                    _logger.Write( "      " );
                }
                catch( Exception ex ){
                    Console.WriteLine( string.Format( "  :        {0}",ex.ToString() ) );
                }
                // Sleep outside the try block so a failing Write still waits a
                // full interval instead of retrying in a tight loop.
                Thread.Sleep( _selfCheckInterval );
            }
        }
      #endregion

      #region Private Members
        private ILogger _logger = null;                                        // log sink; null disables the loops
        private static readonly Queue<string> _s_queue = new Queue<string>();  // pending messages, shared by all instances
        private List<Thread> _threads = new List<Thread>();                    // workers started by this instance

        // volatile: written by the requesting thread, polled by the worker loops.
        private volatile bool _shouldStop;

        private int _selfCheckInterval;   // milliseconds between self-check entries
        private static bool _isSelfCheckRun = false;
      #endregion
    }
  #endregion

} // end namespace My.WSE