Javaプログラミング——オープンソースフレームワークを利用して簡単なWeb検索を実現する

24705 ワード

引用する
Javaのオープンソースライブラリを利用して、簡単な検索エンジンを作成します。このエンジンはウェブサイトの内容をクロールし、ページ内のリンクをたどってさらに深くクロールすることで、関連するすべてのページのアドレスと内容を取得します。ユーザーはキーワードを指定して、関連するすべてのウェブページを検索できます。
具体的な機能
(1)ユーザーは指定したURLに対応するページの内容を取得できる。(2)ウェブページの内容を解析し、その中のURLリンクアドレスをすべて取得する。(3)ユーザーはクロールの深さを設定でき、初期URLのページから始めて、そこに含まれるURLのページ、さらにそのページ内のURLへと順にたどって取得できる。深さが大きいほど、クロールできるページが多くなる。(4)取得したURLの内容を保存し、インデックスを作成する。インデックスの対象は、URLアドレス自体と、そのURLに対応するページタイトルである。(5)ユーザーはキーワードでウェブページを検索し、そのキーワードを含むURLアドレスを見つけることができる。(6)インデックスの作成と検索では中国語のキーワードを認識し、キーワードを単語に分割(分かち書き)して処理できる。(7)ユーザーは、インデックスの保存先、初期URL、クロールの深さ、検索キーワード、最大ヒット件数を指定できる。
使用するオープンソースフレームワーク

Lucene

Jsoup

ソースコード
クローラー部分:Spider.java

package webCrawler.Spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Scanner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import webCrawler.Index.BuildIndex;

/**
 * @author lannooo
 */

public class Spider {
    ArrayList URLs;
    private String startURL;
    private int digLevel;

    /**
     * @param startURL      URL
     * @param digLevel     
     */
    public Spider(String startURL, int digLevel){
        this.startURL = startURL;
        this.digLevel = digLevel;
        this.URLs = new ArrayList<>();
    }

    /**
     * @param level          
     * @param arrayList           URL 
     * @return    url       URL 
     * @throws IOException
     */
    public ArrayList getLevelURLs(int level, ArrayList arrayList) 
            throws IOException{
        ArrayList total = null;
        if(level>0){            
            total = new ArrayList<>();
            for(String url: arrayList){
                /*    arrayList  URL，         ，       URL */
                for(String each: getBareLinks(url)){
                    total.add(each);
                }
            }
            /* HashSet     total       */
            HashSet hashSet = new HashSet<>(total);
            total = new ArrayList<>(hashSet);
        }
        return total;
    }

    /**
     *  startURL  ，      URLs
     * @throws IOException
     */
    public void getAll() throws IOException{
        ArrayList newURLs;
        ArrayList currentURLs = new ArrayList<>();
        /* startURL  currentURLs     ，   url   */
        currentURLs.add(startURL);
        for(int i=digLevel; i>0; i--){
            /*
             *      ，         url     url 
             *              url     URL  
             *   newURLs                     
             */
            System.out.println("Dig into level: " + (digLevel-i+1));
            newURLs = getLevelURLs(i, currentURLs);
            for(String each: currentURLs){
                URLs.add(each);
            }
            currentURLs = newURLs;
        }
        for(String each:currentURLs){
            URLs.add(each);
        }
        HashSet hashSet = new HashSet<>(URLs);
        URLs = new ArrayList<>(hashSet);
    }

    /**
     * @param path        
     * @throws IOException
     */
    public void storeURLsAndInfo(String path) throws IOException{
        BuildIndex build = new BuildIndex(path);
        /*  URLs    url           */
        for(String each:URLs){
            String text = getLinkText(each);
            if(text!=null){
                build.addField("url", each);
                build.addField("text", text);
                /*    entry     */
                build.pushIndex();
            }
        }
        build.close();
    }

    /**
     * @param url          url
     * @return     
     * @throws IOException
     */
    public String getLinkText(String url) throws IOException{
        Document document = null;
        try {
            /* Jsoup    ，       3 */
            document = Jsoup.connect(url).timeout(3000).get();
        } catch (Exception e) {
            System.out.println("[TIMEOUT]Get title of url:"+url);
            return null;
        }
        String title = document.title();
        return title;
    }


    /**
     * @param url        url
     * @return    url         urls  
     * @throws IOException
     */
    public ArrayList getBareLinks(String url) throws IOException{
        ArrayList linksList = new ArrayList<>();
        Document document;

        try {
            document = Jsoup.connect(url).timeout(2000).get();
        } catch (Exception e) {
            return linksList;
        }
        /*         href   ラベル*/
        Elements links = document.select("body").select("a[href]");

        for(Element link: links){
            /*         ラベルからurlを  し、アンカー*/を  
            String href = link.attr("abs:href").replaceAll("#", "");
            /*     zju.edu.cn   url，     '/'*/
            if(href.contains("zju.edu.cn")){
                if (href.endsWith("/")){
                    href = href.substring(0, href.length()-1);
                }
                linksList.add(href);
            }
        }
        HashSet hashSet = new HashSet<>(linksList);
        ArrayList arrayList = new ArrayList<>(hashSet);

        return arrayList;
    }

    public static void main(String[] args) {
        Scanner in = new Scanner(System.in);
        System.out.println("Enter url:");
        String url = in.nextLine().trim();
        while(!url.startsWith("http://")){
            System.out.println("http:// is needed!");
            System.out.println("Enter url:");
            url = in.nextLine().trim();
        }
        System.out.println("Enter depth to dig more urls[<=3 recommended]：");
        int depth = in.nextInt();
        Spider spider = new Spider(url, depth);
        System.out.println("Enter path you want to save[default=d:/index-spider]:");
        String path = in.nextLine().trim();
        if(path.length()==0){
            path = "d:/index-spider";
        }
        try {
            System.out.println("Start fetching...");
            spider.getAll();
            System.out.println("Urls got success!");
            spider.storeURLsAndInfo(path);
            System.out.println("Stored success!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

索引の作成:BuildIndex.java

package webCrawler.Index;

import java.io.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @author lannooo
 *
 */
/**
 * Small wrapper around a Lucene {@link IndexWriter} that builds one document at a
 * time: add fields with {@link #addField(String, String)}, then commit them as one
 * entry with {@link #pushIndex()}.
 *
 * <p>Failures are reported with {@code printStackTrace()} and never thrown.
 */
public class BuildIndex {
    private File file;
    private Directory directory;
    private IndexWriter indexWriter;
    private IndexWriterConfig config;
    private Analyzer analyzer;
    /** Document under construction; replaced by a fresh one after every push. */
    private Document document;

    /**
     * Opens an index writer on the given directory, using {@link IKAnalyzer}.
     *
     * @param path directory that holds the index
     */
    public BuildIndex(String path) {
        try {
            file = new File(path);
            directory = FSDirectory.open(file);
            document = new Document();
            analyzer = new IKAnalyzer();
            config = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer);
            indexWriter = new IndexWriter(directory, config);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds a stored text field to the document under construction.
     *
     * @param fieldName name of the field
     * @param fieldText content of the field
     */
    public void addField(String fieldName, String fieldText) {
        try {
            Field field = new TextField(fieldName, fieldText, Field.Store.YES);
            document.add(field);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the document under construction to the index and starts a new one.
     */
    public void pushIndex() {
        try {
            indexWriter.addDocument(document);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            /*
             * Start a fresh document even when the write failed; otherwise the
             * fields of the failed entry would be merged into the next one.
             */
            document = new Document();
        }
    }

    /**
     * Adds one complete entry made of a {@code "url"} and a {@code "text"} field.
     *
     * @param url  address of the page
     * @param text text stored for that page
     */
    public void addOneIndex(String url, String text) {
        this.addField("url", url);
        this.addField("text", text);
        this.pushIndex();
    }

    /**
     * Closes the index writer and the directory it writes to.
     */
    public void close() {
        /* Either one may be missing when the constructor failed part-way. */
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        /* Closing the writer does not close the directory it was given. */
        if (directory != null) {
            try {
                directory.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

}

索引の検索:SearchIndex.java

package webCrawler.Index;

import java.io.File;
import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @author lannooo
 *
 */
/**
 * Runs one keyword query against a Lucene index and prints the matches.
 *
 * <p>An instance is meant for a single search: {@link #printHits()} closes the
 * index reader when it is done. Call {@link #close()} when no hits are printed.
 */
public class SearchIndex {
    private IndexSearcher indexSearcher;
    private Analyzer analyzer;
    private QueryParser parser;
    private Query query;
    /** Result of the last successful {@link #search(String, String, int)} call. */
    private TopDocs hits;
    private DirectoryReader reader;

    /**
     * Opens the index stored in the given directory, using {@link IKAnalyzer}.
     *
     * @param path directory that holds the index
     */
    public SearchIndex(String path) {
        try {
            reader = DirectoryReader.open(FSDirectory.open(new File(path)));
            indexSearcher = new IndexSearcher(reader);
            analyzer = new IKAnalyzer();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses {@code text} as a query on {@code fieldName} and runs it.
     *
     * @param fieldName   field to search
     * @param text        query text
     * @param matchNumber maximum number of hits to keep
     * @return total number of matching documents, or {@code -1} when the search
     *         failed
     */
    public int search(String fieldName, String text, int matchNumber) {
        try {
            parser = new QueryParser(fieldName, analyzer);
            query = parser.parse(text);
            hits = indexSearcher.search(query, matchNumber);

            return hits.totalHits;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return -1;
    }

    /**
     * Prints the total hit count followed by the {@code "url"} and {@code "text"}
     * fields of every kept hit, then closes the index reader.
     */
    public void printHits() {
        try {
            System.out.println("Total hits number:" + hits.totalHits);
            for (ScoreDoc doc : hits.scoreDocs) {
                Document document = indexSearcher.doc(doc.doc);
                System.out.println(document.get("url"));
                System.out.println(document.get("text"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            /* Release the reader even when printing failed. */
            close();
        }
    }

    /**
     * Closes the index reader. Safe to call when the reader was never opened.
     */
    public void close() {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Command-line entry point: asks for the index directory and the maximum
     * number of hits, then answers queries until {@code q} is entered. Empty
     * queries are ignored.
     */
    public static void main(String[] args) {
        Scanner in = new Scanner(System.in);
        System.out.println("Enter path of the index:");
        String path = in.nextLine().trim();
        while (path.length() == 0) {
            System.out.println("Enter path of the index:");
            path = in.nextLine().trim();
        }

        System.out.println("Enter max hit number:");
        int max = in.nextInt();
        /* Lucene's top-docs collector needs at least one hit, so 0 is refused too. */
        while (max <= 0) {
            System.out.println("Enter max hit number:");
            max = in.nextInt();
        }
        /* Drop the rest of the number's line so the first query is read cleanly. */
        in.nextLine();
        System.out.print("Search>>> ");
        String text = in.nextLine().trim();
        while (!text.equals("q")) {
            if (text.length() > 0) {
                SearchIndex search = new SearchIndex(path);
                int hits = search.search("text", text, max);
                if (hits != -1) {
                    search.printHits();
                } else {
                    /* Nothing gets printed, so the reader is released here. */
                    search.close();
                }
            }
            System.out.print("Search>>> ");
            text = in.nextLine().trim();
        }
    }
}

ユーザーインタフェース:UI.java(ここでは単純なコマンドライン形式としているが、必要に応じてGUIを作成してもよい)

package webCrawler.UI;

import java.util.Scanner;

import webCrawler.Index.SearchIndex;

/**
 * @author lannooo
 *
 */
/**
 * Minimal command-line front end for {@link SearchIndex}.
 */
public class UI {
    /**
     * Reads queries from standard input and prints up to 20 hits for each one.
     * The loop stops at the first empty line or when {@code q} is entered.
     *
     * <p>NOTE(review): the index directory is fixed to {@code d:/index-spider2},
     * which differs from the crawler's default {@code d:/index-spider} - confirm
     * that this is intended.
     */
    public static void main(String[] args) {
        Scanner console = new Scanner(System.in);
        while (true) {
            System.out.print("Search>>> ");
            String keyword = console.nextLine().trim();
            boolean quit = keyword.equals("q") || keyword.length() == 0;
            if (quit) {
                break;
            }
            /* A new searcher per query: printing the hits closes its reader. */
            SearchIndex searcher = new SearchIndex("d:/index-spider2");
            int total = searcher.search("text", keyword, 20);
            if (total != -1) {
                searcher.printHits();
            }
        }
    }
}

【初学者向け】Nginxって？

Javaセキュリティの略語(Java Security Abbreviation)