Converting GenBank files to FASTA files

10365 ワード

Bioinfo~~~
Convert the GenBank file format to the FASTA file format
(.gb -> .fasta)
 
/**
 * Converts GenBank flat-file records (.gb) into two-line FASTA records (.fasta).
 *
 * <p>For every record that provides DEFINITION, ACCESSION, VERSION, ORGANISM and an
 * ORIGIN section, two lines are written: a header of the form
 * {@code >Genu_spec.ACCESSION ( VERSION GI:gi ) { DEFINITION } [ ORGANISM ]}
 * and the upper-cased sequence on a single line. Records missing any of those
 * fields are skipped. Only the first line of a multi-line field is kept.
 *
 * <p>Instances hold the parsing state of the record in progress in their fields.
 */
public class G2F {

	/** Keywords recognised at the start of a line (ORGANISM is indented by two spaces). */
	private static final String ORGANISM = "  ORGANISM",
			ACCESSION = "ACCESSION",
			VERSION = "VERSION",
			DEFINITION = "DEFINITION",
			ORIGIN = "ORIGIN",
			END_DATA = "//";

	/** Marker that introduces the GI number on a VERSION line. */
	private static final String GI_MARKER = "GI:";

	/** Width of the position prefix in front of the bases on a sequence line. */
	private static final int SEQUENCE_COLUMN = 10;

	/** Maximum number of letters taken from genus and species for the short name. */
	private static final int SHORT_NAME_PART = 4;

	/** Stand-in used when the species word is missing or shorter than three letters. */
	private static final String SPECIES_PLACEHOLDER = "XXX";

	/** Which of the required header fields have been seen in the current record. */
	private boolean organismSeen, accessionSeen, versionSeen, definitionSeen;

	String short_name = null, accession_name = null,
			version_name = null, definition_name = null,
			organism_name = null, warning_mess = null,
			origin = null,
			genbank_name = null,
			sequence = "",
			firstline = null, secondline = null;

	/** Forgets which header fields have been seen for the current record. */
	public void resetState()
	{
		organismSeen = false;
		accessionSeen = false;
		versionSeen = false;
		definitionSeen = false;
	}

	/** Clears every value collected for the current record. */
	public void resetName()
	{
		short_name = null;
		accession_name = null;
		version_name = null;
		definition_name = null;
		organism_name = null;
		warning_mess = null;
		genbank_name = null;
		origin = null;
		sequence = "";
		firstline = null;
		secondline = null;
	}

	/**
	 * Reads all GenBank records from {@code inputfile} and writes one FASTA record
	 * (header line plus sequence line) per complete record to {@code outputfile}.
	 *
	 * @param inputfile  path of the GenBank flat file to read
	 * @param outputfile path of the FASTA file to create or overwrite
	 * @throws IOException if the input cannot be read or the output cannot be written
	 */
	public void scan1squence(String inputfile, String outputfile) throws IOException
	{
		// Start clean so that a previous, aborted scan cannot leak into this one.
		resetName();
		resetState();

		// try-with-resources closes both streams even when parsing fails half-way.
		try (BufferedReader in = new BufferedReader(new FileReader(inputfile));
				PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(outputfile)))) {
			String line;
			while ((line = in.readLine()) != null) {
				if (isRecordEnd(line)) {
					// Record without an ORIGIN section: nothing to emit, start over.
					resetName();
					resetState();
				} else if (hasKeyword(line, ORGANISM)) {
					organism_name = fieldValue(line, ORGANISM);
					short_name = shortName(organism_name);
					organismSeen = true;
				} else if (hasKeyword(line, ACCESSION)) {
					accession_name = fieldValue(line, ACCESSION);
					accessionSeen = true;
				} else if (hasKeyword(line, VERSION)) {
					parseVersion(fieldValue(line, VERSION));
					versionSeen = true;
				} else if (hasKeyword(line, DEFINITION)) {
					definition_name = fieldValue(line, DEFINITION);
					definitionSeen = true;
				} else if (hasKeyword(line, ORIGIN)) {
					// readSequence consumes the record terminator as well.
					sequence = readSequence(in);
					if (organismSeen && accessionSeen && versionSeen && definitionSeen) {
						writeRecord(out);
					}
					// Always reset, so an incomplete record cannot bleed into the next one.
					resetName();
					resetState();
				}
			}
			// PrintWriter never throws on write errors; surface them explicitly.
			if (out.checkError()) {
				throw new IOException("Failed to write FASTA output to " + outputfile);
			}
		}
	}

	/** Builds the header and sequence lines from the collected fields and prints them. */
	private void writeRecord(PrintWriter out)
	{
		// Records without a GI number get a header without the "GI:" part.
		String gi = genbank_name.isEmpty() ? "" : " GI:" + genbank_name;
		firstline = ">" + short_name + "." + accession_name
				+ " ( " + version_name + gi + " ) { " + definition_name
				+ " } [ " + organism_name + " ]";
		secondline = sequence;
		out.println(firstline);
		out.println(secondline);
	}

	/** Splits a VERSION value into the version and the optional GI number. */
	private void parseVersion(String value)
	{
		int marker = value.indexOf(GI_MARKER);
		if (marker >= 0) {
			version_name = value.substring(0, marker).trim();
			genbank_name = value.substring(marker + GI_MARKER.length()).trim();
		} else {
			version_name = value.trim();
			genbank_name = "";
		}
	}

	/** Reads sequence lines up to the record terminator or end of input. */
	private static String readSequence(BufferedReader in) throws IOException
	{
		StringBuilder bases = new StringBuilder();
		String line;
		while ((line = in.readLine()) != null && !isRecordEnd(line)) {
			// Lines too short to hold bases after the position prefix are ignored.
			if (line.length() > SEQUENCE_COLUMN) {
				bases.append(line.substring(SEQUENCE_COLUMN).replace(" ", ""));
			}
		}
		return bases.toString().toUpperCase(java.util.Locale.ROOT);
	}

	/** Derives the short name from the first two words of the organism name. */
	private static String shortName(String organism)
	{
		String[] words = organism.trim().split("\\s+");
		String species = words.length > 1 ? words[1] : "";
		if (species.length() < 3) {
			species = SPECIES_PLACEHOLDER;
		}
		return prefix(words[0]) + "_" + prefix(species);
	}

	/** Returns at most the first {@link #SHORT_NAME_PART} characters of {@code word}. */
	private static String prefix(String word)
	{
		return word.substring(0, Math.min(SHORT_NAME_PART, word.length()));
	}

	/** Tells whether {@code line} is the record terminator, ignoring surrounding blanks. */
	private static boolean isRecordEnd(String line)
	{
		return line.trim().equals(END_DATA);
	}

	/** Tells whether {@code line} starts with {@code keyword} as a whole word. */
	private static boolean hasKeyword(String line, String keyword)
	{
		return line.startsWith(keyword)
				&& (line.length() == keyword.length()
					|| Character.isWhitespace(line.charAt(keyword.length())));
	}

	/** Returns the text after {@code keyword}, without the padding that follows the keyword. */
	private static String fieldValue(String line, String keyword)
	{
		int start = keyword.length();
		while (start < line.length() && Character.isWhitespace(line.charAt(start))) {
			start++;
		}
		return line.substring(start);
	}
}
 
 
/**
 * Converts GenBank flat-file records (.gb) into two-line FASTA records (.fasta).
 *
 * <p>For every record that provides DEFINITION, ACCESSION, VERSION, ORGANISM and an
 * ORIGIN section, two lines are written: a header of the form
 * {@code >Genu_spec.ACCESSION ( VERSION GI:gi ) { DEFINITION } [ ORGANISM ]}
 * and the upper-cased sequence on a single line. Records missing any of those
 * fields are skipped. Only the first line of a multi-line field is kept.
 *
 * <p>Instances hold the parsing state of the record in progress in their fields.
 */
public class G2F {

	/** Keywords recognised at the start of a line (ORGANISM is indented by two spaces). */
	private static final String ORGANISM = "  ORGANISM",
			ACCESSION = "ACCESSION",
			VERSION = "VERSION",
			DEFINITION = "DEFINITION",
			ORIGIN = "ORIGIN",
			END_DATA = "//";

	/** Marker that introduces the GI number on a VERSION line. */
	private static final String GI_MARKER = "GI:";

	/** Width of the position prefix in front of the bases on a sequence line. */
	private static final int SEQUENCE_COLUMN = 10;

	/** Maximum number of letters taken from genus and species for the short name. */
	private static final int SHORT_NAME_PART = 4;

	/** Stand-in used when the species word is missing or shorter than three letters. */
	private static final String SPECIES_PLACEHOLDER = "XXX";

	/** Which of the required header fields have been seen in the current record. */
	private boolean organismSeen, accessionSeen, versionSeen, definitionSeen;

	String short_name = null, accession_name = null,
			version_name = null, definition_name = null,
			organism_name = null, warning_mess = null,
			origin = null,
			genbank_name = null,
			sequence = "",
			firstline = null, secondline = null;

	/** Forgets which header fields have been seen for the current record. */
	public void resetState()
	{
		organismSeen = false;
		accessionSeen = false;
		versionSeen = false;
		definitionSeen = false;
	}

	/** Clears every value collected for the current record. */
	public void resetName()
	{
		short_name = null;
		accession_name = null;
		version_name = null;
		definition_name = null;
		organism_name = null;
		warning_mess = null;
		genbank_name = null;
		origin = null;
		sequence = "";
		firstline = null;
		secondline = null;
	}

	/**
	 * Reads all GenBank records from {@code inputfile} and writes one FASTA record
	 * (header line plus sequence line) per complete record to {@code outputfile}.
	 *
	 * @param inputfile  path of the GenBank flat file to read
	 * @param outputfile path of the FASTA file to create or overwrite
	 * @throws IOException if the input cannot be read or the output cannot be written
	 */
	public void scan1squence(String inputfile, String outputfile) throws IOException
	{
		// Start clean so that a previous, aborted scan cannot leak into this one.
		resetName();
		resetState();

		// try-with-resources closes both streams even when parsing fails half-way.
		try (BufferedReader in = new BufferedReader(new FileReader(inputfile));
				PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(outputfile)))) {
			String line;
			while ((line = in.readLine()) != null) {
				if (isRecordEnd(line)) {
					// Record without an ORIGIN section: nothing to emit, start over.
					resetName();
					resetState();
				} else if (hasKeyword(line, ORGANISM)) {
					organism_name = fieldValue(line, ORGANISM);
					short_name = shortName(organism_name);
					organismSeen = true;
				} else if (hasKeyword(line, ACCESSION)) {
					accession_name = fieldValue(line, ACCESSION);
					accessionSeen = true;
				} else if (hasKeyword(line, VERSION)) {
					parseVersion(fieldValue(line, VERSION));
					versionSeen = true;
				} else if (hasKeyword(line, DEFINITION)) {
					definition_name = fieldValue(line, DEFINITION);
					definitionSeen = true;
				} else if (hasKeyword(line, ORIGIN)) {
					// readSequence consumes the record terminator as well.
					sequence = readSequence(in);
					if (organismSeen && accessionSeen && versionSeen && definitionSeen) {
						writeRecord(out);
					}
					// Always reset, so an incomplete record cannot bleed into the next one.
					resetName();
					resetState();
				}
			}
			// PrintWriter never throws on write errors; surface them explicitly.
			if (out.checkError()) {
				throw new IOException("Failed to write FASTA output to " + outputfile);
			}
		}
	}

	/** Builds the header and sequence lines from the collected fields and prints them. */
	private void writeRecord(PrintWriter out)
	{
		// Records without a GI number get a header without the "GI:" part.
		String gi = genbank_name.isEmpty() ? "" : " GI:" + genbank_name;
		firstline = ">" + short_name + "." + accession_name
				+ " ( " + version_name + gi + " ) { " + definition_name
				+ " } [ " + organism_name + " ]";
		secondline = sequence;
		out.println(firstline);
		out.println(secondline);
	}

	/** Splits a VERSION value into the version and the optional GI number. */
	private void parseVersion(String value)
	{
		int marker = value.indexOf(GI_MARKER);
		if (marker >= 0) {
			version_name = value.substring(0, marker).trim();
			genbank_name = value.substring(marker + GI_MARKER.length()).trim();
		} else {
			version_name = value.trim();
			genbank_name = "";
		}
	}

	/** Reads sequence lines up to the record terminator or end of input. */
	private static String readSequence(BufferedReader in) throws IOException
	{
		StringBuilder bases = new StringBuilder();
		String line;
		while ((line = in.readLine()) != null && !isRecordEnd(line)) {
			// Lines too short to hold bases after the position prefix are ignored.
			if (line.length() > SEQUENCE_COLUMN) {
				bases.append(line.substring(SEQUENCE_COLUMN).replace(" ", ""));
			}
		}
		return bases.toString().toUpperCase(java.util.Locale.ROOT);
	}

	/** Derives the short name from the first two words of the organism name. */
	private static String shortName(String organism)
	{
		String[] words = organism.trim().split("\\s+");
		String species = words.length > 1 ? words[1] : "";
		if (species.length() < 3) {
			species = SPECIES_PLACEHOLDER;
		}
		return prefix(words[0]) + "_" + prefix(species);
	}

	/** Returns at most the first {@link #SHORT_NAME_PART} characters of {@code word}. */
	private static String prefix(String word)
	{
		return word.substring(0, Math.min(SHORT_NAME_PART, word.length()));
	}

	/** Tells whether {@code line} is the record terminator, ignoring surrounding blanks. */
	private static boolean isRecordEnd(String line)
	{
		return line.trim().equals(END_DATA);
	}

	/** Tells whether {@code line} starts with {@code keyword} as a whole word. */
	private static boolean hasKeyword(String line, String keyword)
	{
		return line.startsWith(keyword)
				&& (line.length() == keyword.length()
					|| Character.isWhitespace(line.charAt(keyword.length())));
	}

	/** Returns the text after {@code keyword}, without the padding that follows the keyword. */
	private static String fieldValue(String line, String keyword)
	{
		int start = keyword.length();
		while (start < line.length() && Character.isWhitespace(line.charAt(start))) {
			start++;
		}
		return line.substring(start);
	}
}