ページ、ファイルのリンクの抽出
2266 ワード
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Small utility that reads a text/HTML file and extracts the {@code src} attribute
 * value from the {@code <img>} tags found in it.
 */
public class PatternTest {

    /** File read by {@link #main(String[])} when no path is passed on the command line. */
    private static final String DEFAULT_PATH = "d:/test.txt";

    /**
     * Locates a {@code src} attribute inside a single tag and captures its value:
     * group 1 for a double-quoted value, group 2 for a single-quoted value, group 3 for
     * an unquoted value (a stray, unclosed opening quote is tolerated and skipped).
     * The look-behind rejects names that merely end in "src", such as {@code data-src}.
     * Compiled once because it is applied to every matched tag.
     */
    private static final Pattern SRC_ATTRIBUTE = Pattern.compile(
            "(?<![\\w-])src\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|[\"']?([^\\s\"'>]*))",
            Pattern.CASE_INSENSITIVE);

    /**
     * Reads a file and prints every {@code <img>} tag it contains to standard output.
     *
     * @param args optional; {@code args[0]} is the file to scan, otherwise
     *             {@code d:/test.txt} is used
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : DEFAULT_PATH;
        PatternTest t = new PatternTest();
        String regexhref = "<(?i)img.*?>";
        String content = t.redFIle(path);
        t.replaceHref(content, regexhref);
    }

    /**
     * Reads the whole file as UTF-8 text.
     *
     * <p>The content is returned verbatim, line terminators included, so that tokens on
     * adjacent lines are not glued together. Reading is best-effort: on an I/O failure
     * the problem is reported on standard error and whatever was read so far (possibly
     * the empty string) is returned.
     *
     * @param path path of the file to read; {@code null} yields the empty string
     * @return the decoded file content, never {@code null}
     */
    public String redFIle(String path) {
        StringBuilder text = new StringBuilder();
        if (path == null) {
            return text.toString();
        }
        // try-with-resources closes the stream even when reading fails part-way.
        try (FileInputStream fin = new FileInputStream(path);
                BufferedReader br = new BufferedReader(new InputStreamReader(fin, "utf-8"))) {
            char[] buffer = new char[8192];
            int count;
            while ((count = br.read(buffer)) != -1) {
                text.append(buffer, 0, count);
            }
        } catch (IOException e) {
            System.err.println("Failed to read " + path + ": " + e);
        }
        return text.toString();
    }

    /**
     * Scans {@code hBody} for every match of {@code regex}, prints each match to
     * standard output, and extracts the {@code src} attribute value of each match.
     *
     * <p>Despite its name, nothing is replaced. Only the value belonging to the
     * <em>last</em> match is returned; if that match has no {@code src} attribute the
     * result is the empty string.
     *
     * @param hBody text to scan; {@code null} is treated as empty
     * @param regex regular expression selecting the tags of interest; it is compiled
     *              with {@link Pattern#DOTALL} so a tag may span several lines
     * @return the trimmed {@code src} value of the last match, or {@code ""} if there
     *         is no match
     */
    public String replaceHref(String hBody, String regex) {
        String url = "";
        if (hBody == null) {
            return url;
        }
        Matcher tags = Pattern.compile(regex, Pattern.DOTALL).matcher(hBody);
        while (tags.find()) {
            String tag = tags.group();
            System.out.println(tag); // img
            url = lastSrcValue(tag).trim();
        }
        return url;
    }

    /**
     * Returns the value of the last {@code src} attribute found in {@code tag}, or the
     * empty string if there is none. The value is taken from the capturing group, so
     * quotes or {@code >} characters inside the value are left untouched.
     */
    private static String lastSrcValue(String tag) {
        String value = "";
        Matcher src = SRC_ATTRIBUTE.matcher(tag);
        while (src.find()) {
            // Exactly one of the three alternatives captured the value.
            for (int g = 1; g <= src.groupCount(); g++) {
                if (src.group(g) != null) {
                    value = src.group(g);
                    break;
                }
            }
        }
        return value;
    }
}