xml形式ファイル(約50 G)をjson形式に変換しmongodbデータベースにアップロード


約50Gのxmlファイルをjson形式に変換し、mongodbデータベースにアップロードします。
xmlを解析するクラス
import java.util.ArrayList;
import java.util.List;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

import com.mongodb.DBObject;
/**
 * SAX {@link ContentHandler} that turns every {@code <page>} element into a
 * {@link Data} record and writes the records to MongoDB in batches.
 *
 * <p>Within a page the text of {@code <title>}, {@code <ns>}, the first
 * {@code <id>}, {@code <timestamp>} and {@code <text>} is collected, in
 * document order, into {@link #list}. After {@code <text>} two derived values
 * are appended: the redirect target (or {@code ""}) and the record type
 * ({@code "redirect"} or {@code "article"}). {@code storeDBMongo} reads those
 * seven values by position, so the input is expected to present the elements
 * in exactly that order.
 *
 * <p>Besides the {@code ContentHandler} callbacks the class has two private
 * helpers of its own, {@code writeToMongoDB()} and {@code storeDBMongo()}.
 *
 * <p>Original header: author left blank, time 2015-11-8.
 */
public class MyContentHandler implements ContentHandler {
	/** Number of pending records that triggers a write to MongoDB. */
	private static final int BATCH_SIZE = 300;
	/** Values read per page: title, ns, id, timestamp, text, target, type. */
	private static final int FIELDS_PER_PAGE = 7;

	/** Accumulates the character data of the element currently being captured. */
	private StringBuilder buf = new StringBuilder();
	/** Name of the element whose text is being captured, or {@code null} when idle. */
	private String current;
	/** True from {@code <page>} until its first {@code <id>}; later ids are ignored. */
	private boolean expectPageId;
	/** Records waiting to be written to MongoDB. */
	List<Data> listdata = new ArrayList<Data>();
	/** Positional field values of the page currently being read. */
	List<String> list = new ArrayList<String>();

	/** Not used by this handler. */
	@Override
	public void setDocumentLocator(Locator locator) {
	}

	/** Resets the text buffer and prints the start banner. */
	@Override
	public void startDocument() throws SAXException {
		buf = new StringBuilder();
		System.out.println("*******    *******");
	}

	/**
	 * Writes the records still pending after the last full batch and prints
	 * the end banner. A write failure is reported on stderr and does not
	 * abort the parse.
	 */
	@Override
	public void endDocument() throws SAXException {
		try {
			writeToMongoDB();
		} catch (Exception e) {
			e.printStackTrace();
		}
		System.out.println("*******    *******");
	}

	/**
	 * Converts every pending record to a {@code DBObject} and sends the batch
	 * to MongoDB. Does nothing when no record is pending, so an empty batch is
	 * never handed to the writer. The caller decides when to clear
	 * {@link #listdata}.
	 *
	 * @throws Exception whatever the conversion or the MongoDB write throws
	 */
	private void writeToMongoDB() throws Exception {
		if (listdata.isEmpty()) {
			return;
		}
		List<DBObject> dblist = new ArrayList<DBObject>(listdata.size());
		for (Data d : listdata) {
			dblist.add(BSONT.mapToBSON(d.toJSONMap()));
		}
		// NOTE(review): host, database and collection are placeholder literals.
		MongoDBT.writeListToMongo("IP", 27017, "databaseName", "collectionName", dblist);
	}

	/** Not used by this handler. */
	@Override
	public void startPrefixMapping(String prefix, String uri)
			throws SAXException {
	}

	/** Not used by this handler. */
	@Override
	public void endPrefixMapping(String prefix) throws SAXException {
	}

	/**
	 * Starts capturing text when one of the tracked elements opens. Names are
	 * compared with {@code equals}, so the handler does not depend on the
	 * parser interning element names.
	 */
	@Override
	public void startElement(String uri, String localName, String qName,
			Attributes attributes) throws SAXException {
		if ("page".equals(qName)) {
			expectPageId = true;
			// Drop anything collected outside a page so positions start at zero.
			list.clear();
		}
		if ("title".equals(qName) || "ns".equals(qName)
				|| "timestamp".equals(qName) || "text".equals(qName)) {
			beginCapture(qName);
		} else if ("id".equals(qName) && expectPageId) {
			// Only the first <id> after <page> is recorded.
			expectPageId = false;
			beginCapture(qName);
		}
	}

	/** Marks {@code name} as the element being captured and empties the buffer. */
	private void beginCapture(String name) {
		current = name;
		buf.setLength(0);
	}

	/**
	 * Finishes the element being captured by appending its text to
	 * {@link #list}; for {@code <text>} the redirect target and record type
	 * are appended as well. When {@code </page>} is reached the collected
	 * values are turned into a record.
	 */
	@Override
	public void endElement(String uri, String localName, String qName)
			throws SAXException {
		if (qName.equals(current)) {
			String value = buf.toString();
			buf.setLength(0);
			current = null;
			list.add(value);
			if ("text".equals(qName)) {
				addRedirectInfo(value);
			}
		}
		if ("page".equals(qName)) {
			storeDBMongo(list);
		}
	}

	/**
	 * Appends the redirect target and the record type for a page's markup.
	 * A page counts as a redirect when the markup contains a redirect keyword
	 * and a {@code [[...]]} link; the target is the text between the first
	 * {@code [[} and the first {@code ]]} that follows it. Otherwise an empty
	 * target and the type {@code "article"} are appended.
	 */
	private void addRedirectInfo(String markup) {
		String target = "";
		String type = "article";
		// Locale.ROOT keeps the keyword match independent of the default locale.
		// NOTE(review): the second keyword literal is "#" plus three spaces;
		// it looks like non-ASCII text that was lost in this copy - confirm.
		if (markup.toUpperCase(java.util.Locale.ROOT).contains("#REDIRECT")
				|| markup.contains("#   ")) {
			int open = markup.indexOf("[[");
			// Search for the closing brackets after the opening ones; a "]]"
			// that precedes "[[" must not produce an inverted substring range.
			int close = open < 0 ? -1 : markup.indexOf("]]", open + 2);
			if (close >= 0) {
				target = markup.substring(open + 2, close);
				type = "redirect";
			}
		}
		list.add(target);
		list.add(type);
	}

	/**
	 * Builds a {@link Data} record from the positional values in {@code lt},
	 * queues it, and writes the queue to MongoDB once it holds
	 * {@code BATCH_SIZE} records. {@code lt} is always cleared afterwards,
	 * also on failure, so one malformed page cannot shift the fields of the
	 * pages that follow.
	 *
	 * @param lt values in the order title, ns, id, timestamp, text, target, type
	 */
	private void storeDBMongo(List<String> lt) {
		// NOTE(review): dumps every field, including the full page text, to stdout.
		for (String field : lt) {
			System.out.println(field);
		}
		try {
			if (lt.size() < FIELDS_PER_PAGE) {
				System.err.println("Skipping page: expected " + FIELDS_PER_PAGE
						+ " fields but collected " + lt.size());
				return;
			}
			Data data = new Data();
			data.setTitle(lt.get(0));
			data.setNamespace(lt.get(1));
			data.setId(lt.get(2));
			data.setLastEsited(lt.get(3));
			data.setMarkup(lt.get(4));
			data.setTarget(lt.get(5));
			data.setType(lt.get(6));
			listdata.add(data);
			if (listdata.size() >= BATCH_SIZE) {
				writeToMongoDB();
				listdata.clear();
			}
		} catch (Exception e) {
			// A failed write keeps the queue, so it is retried with the next page.
			e.printStackTrace();
		} finally {
			lt.clear();
		}
	}

	/**
	 * Appends character data to the buffer while a tracked element is open.
	 * The parser may deliver one element's text in several chunks, so the
	 * value is added to {@link #list} only in {@code endElement}.
	 */
	@Override
	public void characters(char[] ch, int start, int length)
			throws SAXException {
		if (current != null) {
			buf.append(ch, start, length);
		}
	}

	/** Not used by this handler. */
	@Override
	public void ignorableWhitespace(char[] ch, int start, int length)
			throws SAXException {
	}

	/** Not used by this handler. */
	@Override
	public void processingInstruction(String target, String data)
			throws SAXException {
	}

	/** Not used by this handler. */
	@Override
	public void skippedEntity(String name) throws SAXException {
	}

}

カスタムクラスData、JSONT
import java.util.HashMap;
import java.util.Map;


/**
 * Mutable value holder for one page record. All seven properties are plain
 * strings with a getter and a setter each; {@link #toJSONMap()} exposes them
 * as a map keyed by property name.
 */
public class Data {
	private String id;
	private String namespace;
	private String type;
	private String title;
	private String markup;
	private String lastEsited;
	private String target;

	/** @return the id, or {@code null} if it was never set */
	public String getId() {
		return this.id;
	}

	/** @param id the new id */
	public void setId(String id) {
		this.id = id;
	}

	/** @return the namespace, or {@code null} if it was never set */
	public String getNamespace() {
		return this.namespace;
	}

	/** @param namespace the new namespace */
	public void setNamespace(String namespace) {
		this.namespace = namespace;
	}

	/** @return the type, or {@code null} if it was never set */
	public String getType() {
		return this.type;
	}

	/** @param type the new type */
	public void setType(String type) {
		this.type = type;
	}

	/** @return the title, or {@code null} if it was never set */
	public String getTitle() {
		return this.title;
	}

	/** @param title the new title */
	public void setTitle(String title) {
		this.title = title;
	}

	/** @return the markup, or {@code null} if it was never set */
	public String getMarkup() {
		return this.markup;
	}

	/** @param markup the new markup */
	public void setMarkup(String markup) {
		this.markup = markup;
	}

	/** @return the lastEsited value, or {@code null} if it was never set */
	public String getLastEsited() {
		return this.lastEsited;
	}

	/** @param lastEsited the new lastEsited value */
	public void setLastEsited(String lastEsited) {
		this.lastEsited = lastEsited;
	}

	/** @return the target, or {@code null} if it was never set */
	public String getTarget() {
		return this.target;
	}

	/** @param target the new target */
	public void setTarget(String target) {
		this.target = target;
	}

	/**
	 * Copies the current property values into a new {@link HashMap}.
	 *
	 * @return a fresh map with the keys {@code id}, {@code namespace},
	 *         {@code type}, {@code title}, {@code markup}, {@code lastEsited}
	 *         and {@code target}; unset properties map to {@code null}
	 */
	public Map<String, Object> toJSONMap() {
		final Map<String, Object> fields = new HashMap<String, Object>();
		fields.put("id", id);
		fields.put("namespace", namespace);
		fields.put("type", type);
		fields.put("title", title);
		fields.put("markup", markup);
		fields.put("lastEsited", lastEsited);
		fields.put("target", target);
		return fields;
	}

}


/*
 *    NextMap-Crawler Module
 *    
 *    Copyright (C) 2002-2014,Institute of Geographic Sciences and Natural Resources Research,
 *    Chinese Academy of Sciences
 *
 *    This library is free software; you can redistribute it and/or
 *    modify it under the terms of the GNU Lesser General Public
 *    License as published by the Free Software Foundation;
 *    version 2.1 of the License.
 *
 *    This library is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *    Lesser General Public License for more details.
 */


import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import java.util.Map;

import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * JSON conversion helpers built on Jackson's {@link ObjectMapper}.
 *
 * <p>None of the methods throws on an I/O or parse failure: the exception's
 * stack trace is printed and a fallback value is returned instead (whatever
 * had been written so far for the serialising methods, {@code null} for the
 * parsing methods).
 *
 * @author zhuhaichuan
 * @date 2015-11-8
 */

public class JSONT {
	/**
	 * Shared mapper. It is never reconfigured after creation, which is the
	 * condition under which Jackson documents ObjectMapper as safe to share,
	 * and it avoids building a new mapper on every call.
	 */
	private static final ObjectMapper MAPPER = new ObjectMapper();

	/**
	 * Serialises a map to JSON text.
	 *
	 * @param map the map to serialise
	 * @return the JSON text; on failure, whatever was written before the error
	 */
	public static String mapToJSONString(Map<?, ?> map) {
		return toJSONString(map);
	}

	/**
	 * Serialises a list to JSON text.
	 *
	 * @param list the list to serialise
	 * @return the JSON text; on failure, whatever was written before the error
	 */
	public static String listToJSONString(List<?> list) {
		return toJSONString(list);
	}

	/**
	 * Serialises an arbitrary object to JSON text.
	 *
	 * @param bean the object to serialise
	 * @return the JSON text; on failure, whatever was written before the error
	 */
	public static String beanToJSONString(Object bean) {
		return toJSONString(bean);
	}

	/**
	 * Parses JSON text into a map.
	 *
	 * @param jsonstr the JSON text; may be {@code null}
	 * @return the parsed map, or {@code null} when {@code jsonstr} is
	 *         {@code null} or cannot be read as a map
	 */
	@SuppressWarnings("rawtypes") // raw return type kept for existing callers
	public static Map jsonToMap(String jsonstr) {
		return fromJSONString(jsonstr, Map.class);
	}

	/**
	 * Parses JSON text into a list.
	 *
	 * @param jsonstr the JSON text; may be {@code null}
	 * @return the parsed list, or {@code null} when {@code jsonstr} is
	 *         {@code null} or cannot be read as a list
	 */
	@SuppressWarnings("rawtypes") // raw return type kept for existing callers
	public static List jsonToList(String jsonstr) {
		return fromJSONString(jsonstr, List.class);
	}

	/** Writes {@code value} as JSON, printing the stack trace on failure. */
	private static String toJSONString(Object value) {
		StringWriter sw = new StringWriter();
		try {
			MAPPER.writeValue(sw, value);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sw.toString();
	}

	/** Reads {@code jsonstr} as {@code type}; returns null for null or unreadable input. */
	private static <T> T fromJSONString(String jsonstr, Class<T> type) {
		if (jsonstr == null) {
			// Nothing to parse; report it the same way as unreadable input.
			return null;
		}
		try {
			return MAPPER.readValue(jsonstr, type);
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
	}
}
そしてクラスMyErrorHandler
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;


/**
 * SAX {@link ErrorHandler} that reports every parser problem on standard
 * output and never throws: a banner line, the line number, the column number,
 * the exception message and a closing line of asterisks.
 */
public class MyErrorHandler implements ErrorHandler {

	/** Prints the problem under the WARNING banner. */
	@Override
	public void warning(SAXParseException exception) throws SAXException {
		report("*******WARNING******", "exception  :", "********************", exception);
	}

	/** Prints the problem under the ERROR banner. */
	@Override
	public void error(SAXParseException exception) throws SAXException {
		report("******* ERROR ******", "exception  :", "********************", exception);
	}

	/** Prints the problem under the FATAL ERROR banner; its message label has no colon. */
	@Override
	public void fatalError(SAXParseException exception) throws SAXException {
		report("******** FATAL ERROR ********", "exception  ", "*****************************", exception);
	}

	/**
	 * Prints one five-line report.
	 *
	 * @param banner       first line of the report
	 * @param messageLabel text printed directly before the exception message
	 * @param closing      last line of the report
	 * @param problem      the parser problem being reported
	 */
	private static void report(String banner, String messageLabel, String closing,
			SAXParseException problem) {
		System.out.println(banner);
		// NOTE(review): the line and column labels are blank in this copy of
		// the file; presumably their text was lost - confirm.
		System.out.println("  :" + problem.getLineNumber());
		System.out.println("  :" + problem.getColumnNumber());
		System.out.println(messageLabel + problem.getMessage());
		System.out.println(closing);
	}

}

カスタムMongoDBTクラス
import java.util.ArrayList;
import java.util.List;

import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;


/**
 * Helper for writing a batch of documents to a MongoDB collection.
 */
public class MongoDBT {
	/**
	 * Inserts all documents of {@code list} into one collection, using a
	 * connection that is opened for this call and closed again before it
	 * returns, also when the insert fails.
	 *
	 * @param ip       host name or address of the MongoDB server
	 * @param port     port of the MongoDB server
	 * @param dbname   name of the database
	 * @param collname name of the collection
	 * @param list     documents to insert; a {@code null} or empty list is a
	 *                 no-op and no connection is opened
	 * @throws Exception whatever the driver throws while connecting or inserting
	 */
	public static void writeListToMongo(String ip,int port,String dbname,String collname,List<DBObject> list) throws Exception{
		if (list == null || list.isEmpty()) {
			return;
		}
		Mongo mongo = new Mongo(ip, port);
		try {
			DB db = mongo.getDB(dbname);
			DBCollection collection = db.getCollection(collname);
			// Insert a copy so the driver never works on the caller's list object.
			collection.insert(new ArrayList<DBObject>(list));
		} finally {
			// Release the connection even when the insert throws.
			mongo.close();
		}
	}
}