xml形式ファイル(約50 G)をjson形式に変換しmongodbデータベースにアップロード
約50Gのxmlファイルを解析してjson形式に変換し、mongodbに格納するためのコードを以下に示す。
xmlを解析するクラス(MyContentHandler)
import java.util.ArrayList;
import java.util.List;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import com.mongodb.DBObject;
/**
 * SAX {@link ContentHandler} that builds one record per {@code page} element and
 * bulk-inserts the records into MongoDB.
 *
 * <p>For every {@code page} the handler captures, in document order, the character
 * content of the {@code title}, {@code ns}, first {@code id}, {@code timestamp} and
 * {@code text} elements. When a {@code text} element ends it additionally appends a
 * redirect target and a record type ({@code "redirect"} or {@code "article"}). At the
 * end of the {@code page} the collected values are copied into a {@code Data} object.
 * Records are buffered and written through {@code MongoDBT.writeListToMongo} once
 * {@code BATCH_SIZE} of them have accumulated, and once more at the end of the
 * document.
 *
 * <p>NOTE(review): the element names look like a MediaWiki XML export - confirm
 * against the actual input file.
 *
 * <p>Created 2015-11-8.
 */
public class MyContentHandler implements ContentHandler {
    /** Number of buffered records that triggers a bulk insert. */
    private static final int BATCH_SIZE = 300;

    /** title, ns, id, timestamp, text, redirect target, record type. */
    private static final int FIELDS_PER_PAGE = 7;

    /** Accumulates the text of the element currently being captured. */
    private final StringBuilder buf = new StringBuilder();

    /** Name of the element whose text is being captured, or {@code null} if none. */
    private String capturing;

    /** True between a {@code page} start tag and the first {@code id} that follows it. */
    private boolean expectPageId;

    /** Records waiting to be written to MongoDB. */
    List<Data> listdata = new ArrayList<Data>();

    /** Positional field values of the page currently being parsed. */
    List<String> list = new ArrayList<String>();

    @Override
    public void setDocumentLocator(Locator locator) {
        // Location information is not used.
    }

    /** Resets the text buffer and prints the start banner. */
    @Override
    public void startDocument() throws SAXException {
        buf.setLength(0);
        System.out.println("******* *******");
    }

    /**
     * Flushes the records that are still buffered and prints the end banner.
     * A failing write is reported on stderr and does not abort the parse.
     */
    @Override
    public void endDocument() throws SAXException {
        try {
            writeToMongoDB();
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("******* *******");
    }

    /**
     * Converts all buffered records and inserts them in one batch. The buffer is
     * emptied only after the insert succeeded, so a failed batch stays queued.
     *
     * @throws Exception whatever the conversion or {@code MongoDBT.writeListToMongo} throws
     */
    private void writeToMongoDB() throws Exception {
        if (listdata.isEmpty()) {
            // Nothing to write (e.g. the last batch was flushed exactly at the end).
            return;
        }
        List<DBObject> dblist = new ArrayList<DBObject>(listdata.size());
        for (Data d : listdata) {
            dblist.add(BSONT.mapToBSON(d.toJSONMap()));
        }
        MongoDBT.writeListToMongo("IP", 27017, "databaseName", "collectionName", dblist);
        listdata.clear();
    }

    @Override
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        // Namespace prefix mappings are not used.
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
        // Namespace prefix mappings are not used.
    }

    /**
     * Starts capturing text when one of the wanted elements opens. Only the first
     * {@code id} after a {@code page} start tag is captured; later {@code id}
     * elements inside the same page are ignored.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        // Names are compared with equals(): SAX does not promise interned strings
        // unless the string-interning feature is enabled on the reader.
        if ("page".equals(qName)) {
            expectPageId = true;
        } else if ("id".equals(qName)) {
            if (expectPageId) {
                capturing = qName;
                expectPageId = false;
            }
        } else if (isCapturedElement(qName)) {
            capturing = qName;
        }
    }

    /** Returns true for the elements (other than {@code id}) whose text is always captured. */
    private static boolean isCapturedElement(String qName) {
        return "title".equals(qName)
                || "ns".equals(qName)
                || "timestamp".equals(qName)
                || "text".equals(qName);
    }

    /**
     * Stores the captured text when the captured element closes and hands the
     * collected fields to {@link #storeDBMongo(List)} when a {@code page} closes.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (capturing != null && capturing.equals(qName)) {
            String value = buf.toString();
            buf.setLength(0);
            capturing = null;
            list.add(value);
            if ("text".equals(qName)) {
                addRedirectInfo(value);
            }
        }
        if ("page".equals(qName)) {
            storeDBMongo(list);
        }
    }

    /** Appends the redirect target and the record type derived from the page text. */
    private void addRedirectInfo(String text) {
        String target = extractRedirectTarget(text);
        if (target != null) {
            list.add(target);
            list.add("redirect");
        } else {
            list.add("");
            list.add("article");
        }
    }

    /**
     * Returns the text between the first {@code [[} and the next {@code ]]} when the
     * page text carries a redirect marker, or {@code null} otherwise.
     */
    private static String extractRedirectTarget(String text) {
        // Locale.ROOT keeps the case mapping independent of the JVM default locale.
        // NOTE(review): the second marker "# " is kept from the original; it looks
        // like a truncated localized redirect keyword - confirm the intended value.
        boolean marked = text.toUpperCase(java.util.Locale.ROOT).contains("#REDIRECT")
                || text.contains("# ");
        if (!marked) {
            return null;
        }
        int open = text.indexOf("[[");
        if (open < 0) {
            return null;
        }
        // Search for the closing brackets after the opening ones so that a stray
        // "]]" earlier in the text cannot produce an invalid substring range.
        int close = text.indexOf("]]", open + 2);
        if (close < 0) {
            return null;
        }
        return text.substring(open + 2, close);
    }

    /**
     * Builds a {@code Data} record from the positional fields of one page, queues it
     * and flushes the queue once it holds {@code BATCH_SIZE} records. The field list
     * is always emptied afterwards so that one bad page cannot misalign the next.
     *
     * @param lt field values in the order title, ns, id, timestamp, text, target, type
     */
    private void storeDBMongo(List<String> lt) {
        try {
            for (String field : lt) {
                System.out.println(field);
            }
            if (lt.size() < FIELDS_PER_PAGE) {
                System.err.println("Skipping incomplete page record: expected "
                        + FIELDS_PER_PAGE + " fields but got " + lt.size());
                return;
            }
            Data data = new Data();
            data.setTitle(lt.get(0));
            data.setNamespace(lt.get(1));
            data.setId(lt.get(2));
            data.setLastEsited(lt.get(3));
            data.setMarkup(lt.get(4));
            data.setTarget(lt.get(5));
            data.setType(lt.get(6));
            listdata.add(data);
            if (listdata.size() >= BATCH_SIZE) {
                writeToMongoDB();
            }
        } catch (Exception e) {
            // Best effort: report the failure and keep parsing.
            e.printStackTrace();
        } finally {
            lt.clear();
        }
    }

    /**
     * Appends character data to the buffer while a wanted element is open. The
     * parser may deliver the content of one element in several chunks, so the value
     * is only stored once the element ends.
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        if (capturing != null) {
            buf.append(ch, start, length);
        }
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        // Ignorable whitespace is not used.
    }

    @Override
    public void processingInstruction(String target, String data)
            throws SAXException {
        // Processing instructions are not used.
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
        // Skipped entities are not used.
    }
}
カスタムクラスData、JSONT
import java.util.HashMap;
import java.util.Map;
/**
 * Mutable holder for the seven string fields of one record, with a helper that
 * exposes them as a map keyed by field name.
 */
public class Data {
    private String id;
    private String namespace;
    private String type;
    private String title;
    private String markup;
    private String lastEsited;
    private String target;

    /** @return the id value, or {@code null} if it was never set */
    public String getId() {
        return this.id;
    }

    /** @param id the id value to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the namespace value, or {@code null} if it was never set */
    public String getNamespace() {
        return this.namespace;
    }

    /** @param namespace the namespace value to store */
    public void setNamespace(String namespace) {
        this.namespace = namespace;
    }

    /** @return the type value, or {@code null} if it was never set */
    public String getType() {
        return this.type;
    }

    /** @param type the type value to store */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the title value, or {@code null} if it was never set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title value to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the markup value, or {@code null} if it was never set */
    public String getMarkup() {
        return this.markup;
    }

    /** @param markup the markup value to store */
    public void setMarkup(String markup) {
        this.markup = markup;
    }

    /** @return the lastEsited value, or {@code null} if it was never set */
    public String getLastEsited() {
        return this.lastEsited;
    }

    /** @param lastEsited the lastEsited value to store */
    public void setLastEsited(String lastEsited) {
        this.lastEsited = lastEsited;
    }

    /** @return the target value, or {@code null} if it was never set */
    public String getTarget() {
        return this.target;
    }

    /** @param target the target value to store */
    public void setTarget(String target) {
        this.target = target;
    }

    /**
     * Copies the current field values into a new map.
     *
     * @return a fresh {@link HashMap} with one entry per field; unset fields map to {@code null}
     */
    public Map<String,Object> toJSONMap(){
        String[] keys = {
            "id", "namespace", "type", "title", "markup", "lastEsited", "target"
        };
        Object[] values = {
            this.id, this.namespace, this.type, this.title,
            this.markup, this.lastEsited, this.target
        };
        Map<String,Object> fields = new HashMap<String,Object>();
        for (int k = 0; k < keys.length; k++) {
            fields.put(keys[k], values[k]);
        }
        return fields;
    }
}
/*
* NextMap-Crawler Module
*
* Copyright (C) 2002-2014,Institute of Geographic Sciences and Natural Resources Research,
* Chinese Academy of Sciences
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation;
* version 2.1 of the License.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*/
import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
/**
 * Static helpers that convert between Java objects and JSON text using Jackson's
 * {@link ObjectMapper}.
 *
 * <p>None of the methods propagate {@link IOException}: a failure is printed to
 * stderr and the method returns whatever it has at that point (see each method).
 *
 * @author zhuhaichuan
 * @date 2015-11-8
 */
public class JSONT {
    /**
     * Shared mapper. It is created once and never reconfigured, so every call
     * reuses it instead of paying for a new instance.
     */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Serializes a map to JSON text.
     *
     * @param map the map to serialize
     * @return the JSON text; if serialization fails, whatever was written before the failure
     */
    public static String mapToJSONString(Map map) {
        return writeToString(map);
    }

    /**
     * Serializes a list to JSON text.
     *
     * @param list the list to serialize
     * @return the JSON text; if serialization fails, whatever was written before the failure
     */
    public static String listToJSONString(List list) {
        return writeToString(list);
    }

    /**
     * Serializes an arbitrary object to JSON text.
     *
     * @param bean the object to serialize
     * @return the JSON text; if serialization fails, whatever was written before the failure
     */
    public static String beanToJSONString(Object bean) {
        return writeToString(bean);
    }

    /**
     * Parses JSON text into a map.
     *
     * @param jsonstr the JSON text to parse
     * @return the parsed map, or {@code null} if parsing fails with an {@link IOException}
     */
    public static Map jsonToMap(String jsonstr) {
        try {
            return MAPPER.readValue(jsonstr, Map.class);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Parses JSON text into a list.
     *
     * @param jsonstr the JSON text to parse
     * @return the parsed list, or {@code null} if parsing fails with an {@link IOException}
     */
    public static List jsonToList(String jsonstr) {
        try {
            return MAPPER.readValue(jsonstr, List.class);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Writes {@code value} as JSON into a string; shared by the three serializers. */
    private static String writeToString(Object value) {
        StringWriter sw = new StringWriter();
        try {
            MAPPER.writeValue(sw, value);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sw.toString();
    }
}
そしてクラスMyErrorHandler
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
/**
 * SAX {@link ErrorHandler} that prints every reported problem to standard output,
 * framed by a banner, and never throws.
 */
public class MyErrorHandler implements ErrorHandler {

    /** Prints the warning with its line number, column number and message. */
    @Override
    public void warning(SAXParseException exception) throws SAXException {
        report("*******WARNING******", "exception :", exception, "********************");
    }

    /** Prints the error with its line number, column number and message. */
    @Override
    public void error(SAXParseException exception) throws SAXException {
        report("******* ERROR ******", "exception :", exception, "********************");
    }

    /** Prints the fatal error with its line number, column number and message. */
    @Override
    public void fatalError(SAXParseException exception) throws SAXException {
        report("******** FATAL ERROR ********", "exception ", exception, "*****************************");
    }

    /**
     * Writes the five-line report shared by all three callbacks.
     *
     * @param header       banner printed first
     * @param messageLabel text printed in front of the exception message
     * @param problem      the reported parse problem
     * @param footer       banner printed last
     */
    private static void report(String header, String messageLabel,
            SAXParseException problem, String footer) {
        System.out.println(header);
        System.out.println(" :" + problem.getLineNumber());
        System.out.println(" :" + problem.getColumnNumber());
        System.out.println(messageLabel + problem.getMessage());
        System.out.println(footer);
    }
}
カスタムMongoDBTクラス
import java.util.ArrayList;
import java.util.List;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
/**
 * Helper for writing a batch of documents to a MongoDB collection.
 */
public class MongoDBT {
    /**
     * Opens a connection, inserts all documents of {@code list} into the given
     * collection with a single {@code insert} call and closes the connection again.
     * A {@code null} or empty list is a no-op and does not open a connection.
     *
     * @param ip       host name or address of the MongoDB server
     * @param port     port of the MongoDB server
     * @param dbname   name of the target database
     * @param collname name of the target collection
     * @param list     documents to insert
     * @throws Exception if connecting or inserting fails
     */
    public static void writeListToMongo(String ip,int port,String dbname,String collname,List<DBObject> list) throws Exception{
        if (list == null || list.isEmpty()) {
            return;
        }
        Mongo mongo = new Mongo(ip, port);
        try {
            DB db = mongo.getDB(dbname);
            DBCollection collection = db.getCollection(collname);
            collection.insert(list);
        } finally {
            // Release the connection even when the insert throws.
            mongo.close();
        }
    }
}