XULRunner with Java: JavaXPCOM Tutorial 4
33580 ワード
7.JavaXPCOMでWebページを自動閲覧する このセクションでは、Webページを自動的に閲覧する際のいくつかの問題を解決し、共通のタスクを人間が読みやすいメソッドに抽象化します.これにより、コードを読むだけで何をしているかが簡単にわかるようになります.Webブラウザーを構築してWebページをロードし、ボタンやハイパーリンクをクリックし、XPathを使って役に立つ情報を抽出します.各セクションでブラウザに新しい機能を追加していくので、最後にはWebページの自動閲覧を実現するWebブラウザーが完成します.7.1ブラウザを使用してWebページをロード 例のSimpleBrowserでは、ブラウザにURLをロードするメソッドpublic boolean setUrl(String url)を使用します.このメソッドの問題は、ブラウザがページのロードを開始するだけで、ロードの完了を待たないことです.この機能を実現するためにgoというメソッドを書きました.以降はこのメソッドを使用してページを安全にロードします.goは、ページのロードが成功するかタイムアウトするまで呼び出し元の実行をブロックします. 注意:実装としては、setUrlを呼び出す前にCountDownLatchを生成し、browser.addProgressListenerで登録したリスナーがページのロード完了を受け取ったときにこのCountDownLatchをcountDownします.setUrlを呼び出した後、CountDownLatchのawaitメソッドを使用してロードの完了またはタイムアウトを待機します.なお、setUrlはSWTのイベントスレッド上で呼び出す必要があります. display.syncExec(new Runnable() { public void run() { browser.setUrl(url); } }); また、ロードされたページの内容も取得したい場合があります.そのため、訳者が変数contentを追加しました.追加したコードは次のとおりです. browser.addProgressListener(new ProgressListener() { public void changed(ProgressEvent event) { } public void completed(ProgressEvent event) { //When a page is loaded, decrement the latch, //which count will be 0 after this call. latch.countDown(); content=browser.getText();//added by LiLi } });
7.2 XPathを解析してW3Cノードを得る ブラウザにHTMLページをロードできるようになったら、DOMノードにアクセスして情報を抽出したくなるでしょう.MozillaのノードをW3Cノードに変換するには、前のセクションで説明した方法を使用します.これにより、W3C Nodeを標準的な方法で操作できます.ノードを抽出するために、XPath evaluatorとXPath resolverを作成するメソッドを実装しました.XPath evaluatorが結果を返したら、static Node getNodeInstance(nsIDOMNode node)を使用して、返された各DOMノードを対応するW3C DOM要素に変換します.したがって、このブラウザでは次のメソッドを直接呼び出すことができます.
以下は完全な例です.訳注:中核となるコードは以下のメソッドです.xpathとnsIDOMNodeの2つのパラメータを取り、XPathに一致するNodeのリストを返します.ここでのNodeはW3C Nodeです.内部ではXPCOMのインタフェースを呼び出してXPathを評価し、得られたnsIDOMNodeのリストをW3C Nodeのリストに変換しています.しかし、私はこの変換は必要ないと感じています.nsIDOMNodeのほうが属性が多く、しかもW3CノードからnsIDOMNodeに戻すことはできないからです.XPathを使用する場合は、まず絶対XPathでtableを見つけ、次に相対XPathでtrやtdなどを探します.このとき、2回目のxPathNodes呼び出しに渡すパラメータnsIDOMNode contextは、1回目の呼び出しで得られた結果のnsIDOMNodeです.
package es.ladyr.javaxpcom.browser;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.eclipse.swt.SWT;
import org.eclipse.swt.browser.ProgressEvent;
import org.eclipse.swt.browser.ProgressListener;
import org.eclipse.swt.widgets.Display;
import org.eclipse.swt.widgets.Shell;
import org.eclipse.swt.browser.Browser;
import org.eclipse.swt.SWTError;
/**
 * Minimal SWT/Mozilla browser wrapper whose {@link #go(String)} method
 * blocks the caller until the requested page has finished loading or a
 * timeout has expired.
 *
 * <p>The SWT event loop runs on a dedicated thread started by the
 * constructor; every call that touches the browser widget is dispatched to
 * that thread through {@code display.syncExec}.
 */
public class SimpleBrowserWithGo {
    // Browser widget; created on the SWT event thread by the constructor.
    Browser browser;
    // We will need the SWT display to execute methods
    // in the SWT event thread.
    private Display display;
    // Latch used to manage page loading. It has a count of 1: 'go' creates a
    // new latch before starting a load and the progress listener decrements
    // it when the page is loaded. It is written by the thread calling 'go'
    // and read by the SWT event thread, hence volatile.
    private volatile CountDownLatch latch;
    // Default timeout, in milliseconds (60 seconds).
    private long defaultTimeout = 60000;

    /**
     * Creates a web browser which is able to load pages waiting until
     * the page is completely loaded.
     *
     * <p>Starts the SWT event thread, opens an 800x600 shell containing the
     * browser, and returns once the browser has been created.
     *
     * @param xulrunnerPath path of the XULRunner files. NOTE(review): this
     *        value is not read anywhere in this class; the browser is created
     *        with {@code new Browser(shell, SWT.MOZILLA)} only.
     * @throws IllegalStateException if the browser widget cannot be
     *         instantiated, or if the calling thread is interrupted while
     *         waiting for the initialization to finish
     */
    public SimpleBrowserWithGo(final String xulrunnerPath) {
        // Use a latch to wait for the browser initialization.
        final CountDownLatch initLatch = new CountDownLatch(1);
        // Carries an initialization failure from the SWT thread back to the
        // constructor; published by initLatch.countDown()/await().
        final SWTError[] initError = new SWTError[1];

        // The browser needs a window manager to work. We are using SWT
        // for the graphical interface, so browser methods must be executed
        // in the SWT event thread.
        new Thread("SWT-Event-Thread") {
            @Override
            public void run() {
                display = new Display();
                Shell shell = new Shell(display);
                shell.setSize(800, 600);
                shell.open();
                try {
                    browser = new Browser(shell, SWT.MOZILLA);
                } catch (SWTError e) {
                    System.out.println("Could not instantiate Browser: " + e.getMessage());
                    // Report the failure and release the waiting constructor;
                    // without this the constructor would block forever.
                    initError[0] = e;
                    display.dispose();
                    initLatch.countDown();
                    return;
                }
                // Adapt browser size to shell size
                browser.setBounds(shell.getClientArea());
                // Listens for page loading status.
                browser.addProgressListener(new ProgressListener() {
                    public void changed(ProgressEvent event) {
                    }

                    public void completed(ProgressEvent event) {
                        // When a page is loaded, decrement the latch.
                        // A 'completed' event may arrive before the first
                        // call to 'go', when no latch exists yet.
                        CountDownLatch current = latch;
                        if (current != null) {
                            current.countDown();
                        }
                    }
                });
                // Release the initialization latch, which has value 1,
                // so after this call its value will be 0.
                initLatch.countDown();
                while (!shell.isDisposed()) {
                    if (!display.readAndDispatch()) {
                        display.sleep();
                    }
                }
                System.exit(0);
            }
        }.start();

        try {
            // Waits until the initialization latch is released.
            initLatch.await();
        } catch (InterruptedException e) {
            // Restore the interrupt status instead of clearing it.
            Thread.currentThread().interrupt();
            throw new IllegalStateException(
                    "Interrupted while waiting for the browser initialization", e);
        }
        if (initError[0] != null) {
            throw new IllegalStateException(
                    "Could not instantiate Browser: " + initError[0].getMessage(), initError[0]);
        }
    }

    /**
     * Loads an URL into the browser and waits until the page is
     * totally loaded.
     *
     * @param url the address to load
     * @throws SimpleBrowserException if the page has not finished loading
     *         within the default timeout
     */
    public void go(final String url) throws SimpleBrowserException {
        // Creates a latch with count 1
        latch = new CountDownLatch(1);
        // Uses the SWT event thread to execute the method to
        // load an URL in the browser.
        display.syncExec(new Runnable() {
            public void run() {
                browser.setUrl(url);
            }
        });
        // Waits for the finish of the page loading, or for a given
        // timeout in case that the loading doesn't finish in a
        // reasonable time.
        boolean timeout = waitLoad(defaultTimeout);
        if (timeout) {
            throw new SimpleBrowserException("Timeout waiting page loading.");
        }
    }

    /**
     * Waits on the latch created by {@link #go(String)} until the progress
     * listener reports completion or {@code millis} milliseconds elapse. On
     * timeout the load is stopped and the latch is awaited once more, for at
     * most {@code millis} milliseconds.
     *
     * @param millis maximum time to wait, in milliseconds
     * @return {@code true} if the first wait timed out, {@code false} if the
     *         page finished loading in time
     */
    private boolean waitLoad(long millis) {
        // Work on the latch of this load even if 'go' is called again.
        final CountDownLatch current = latch;
        try {
            boolean timeout = !current.await(millis, TimeUnit.MILLISECONDS);
            if (timeout) {
                // If the timeout expired, then we will stop
                // page loading.
                display.syncExec(new Runnable() {
                    public void run() {
                        browser.stop();
                    }
                });
                // Waits until the loading is stopped
                current.await(millis, TimeUnit.MILLISECONDS);
            }
            return timeout;
        } catch (InterruptedException e) {
            // Keep the interrupt visible to callers further up the stack.
            Thread.currentThread().interrupt();
            throw new Error(e);
        }
    }

    /**
     * Demo entry point: loads three pages one after another, pausing three
     * seconds after each, and then halts the JVM.
     *
     * @param args optional; {@code args[0]} is passed to the constructor as
     *        the XULRunner path
     */
    public static void main(String[] args) {
        String xulrunnerPath = null;
        if (args.length > 0) {
            xulrunnerPath = args[0];
        }
        // Instantiate our simple web browser
        SimpleBrowserWithGo simpleBrowser = new SimpleBrowserWithGo(xulrunnerPath);
        try {
            // Use the new functionality to load some URLs
            // with our browser.
            simpleBrowser.go("http://www.google.com");
            Thread.sleep(3000);
            simpleBrowser.go("http://www.urjc.es");
            Thread.sleep(3000);
            simpleBrowser.go("http://www.mozilla.org");
            Thread.sleep(3000);
        } catch (SimpleBrowserException e) {
            System.err.println("Problems calling go method.");
            e.printStackTrace();
        } catch (InterruptedException e) {
            System.err.println("Problems calling sleep.");
            e.printStackTrace();
            Thread.currentThread().interrupt();
        }
        // The SWT event thread is still running its loop, so the JVM is
        // halted explicitly.
        Runtime.getRuntime().halt(0);
    }
}
7.2 XPathを解析してW3Cノードを得る ブラウザにHTMLページをロードできるようになったら、DOMノードにアクセスして情報を抽出したくなるでしょう.MozillaのノードをW3Cノードに変換するには、前のセクションで説明した方法を使用します.これにより、W3C Nodeを標準的な方法で操作できます.ノードを抽出するために、XPath evaluatorとXPath resolverを作成するメソッドを実装しました.XPath evaluatorが結果を返したら、static Node getNodeInstance(nsIDOMNode node)を使用して、返された各DOMノードを対応するW3C DOM要素に変換します.したがって、このブラウザでは次のメソッドを直接呼び出すことができます.
...
import org.w3c.dom.Node;
...
/**
*
* @param xpath
* @return a list with the nodes corresponding to a given xpath.
* @throws SimpleBrowserException
*/
public List<Node> xpathNodes(String xpath) {
...
/**
*
* @param <T>
* @param xpath
* @param nodeClass
* @return a list of <code>nodeClass</code> nodes corresponding
* to a given xpath.
* @throws SimpleBrowserException
*/
public <T extends Node> List<T> xpathNodes(String xpath, Class<T> nodeClass) {
...
以下は完全な例です.訳注:中核となるコードは以下のメソッドです.xpathとnsIDOMNodeの2つのパラメータを取り、XPathに一致するNodeのリストを返します.ここでのNodeはW3C Nodeです.内部ではXPCOMのインタフェースを呼び出してXPathを評価し、得られたnsIDOMNodeのリストをW3C Nodeのリストに変換しています.しかし、私はこの変換は必要ないと感じています.nsIDOMNodeのほうが属性が多く、しかもW3CノードからnsIDOMNodeに戻すことはできないからです.XPathを使用する場合は、まず絶対XPathでtableを見つけ、次に相対XPathでtrやtdなどを探します.このとき、2回目のxPathNodes呼び出しに渡すパラメータnsIDOMNode contextは、1回目の呼び出しで得られた結果のnsIDOMNodeです.
/**
 * Evaluates an XPath expression relative to a context node and returns the
 * matching nodes as W3C nodes.
 *
 * @param xpath the XPath expression to evaluate
 * @param context the Mozilla DOM node the expression is evaluated against
 * @return the list produced by {@code getNodes} for the evaluation result
 */
private List<Node> xPathNodes(String xpath, nsIDOMNode context) {
    // The name space resolver is created from the Mozilla DOM HTML document
    // behind the W3C document currently exposed by the browser.
    nsIDOMHTMLDocument mozDocument = ((HTMLDocumentImpl) getW3CDocument()).getInstance();
    nsIDOMXPathNSResolver resolver = xpathEval.createNSResolver(mozDocument);

    // Evaluate the expression in the given context, asking for the result
    // as an ordered node iterator.
    nsISupports rawResult = xpathEval.evaluate(xpath, context, resolver,
            nsIDOMXPathResult.ORDERED_NODE_ITERATOR_TYPE, null);

    // Obtain the XPath result interface of the returned XPCOM object.
    nsIDOMXPathResult xpathResult = (nsIDOMXPathResult) rawResult
            .queryInterface(nsIDOMXPathResult.NS_IDOMXPATHRESULT_IID);

    // Collect the result nodes into a standard List; an XPCOMException
    // raised while doing so propagates to the caller unchanged.
    return getNodes(xpathResult);
}
package es.ladyr.javaxpcom.browser;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.eclipse.swt.SWT;
import org.eclipse.swt.SWTError;
import org.eclipse.swt.browser.Browser;
import org.eclipse.swt.browser.ProgressEvent;
import org.eclipse.swt.browser.ProgressListener;
import org.eclipse.swt.widgets.Display;
import org.eclipse.swt.widgets.Shell;
import org.mozilla.dom.NodeFactory;
import org.mozilla.dom.html.HTMLDocumentImpl;
import org.mozilla.interfaces.nsIComponentManager;
import org.mozilla.interfaces.nsIDOMDocument;
import org.mozilla.interfaces.nsIDOMHTMLDocument;
import org.mozilla.interfaces.nsIDOMNode;
import org.mozilla.interfaces.nsIDOMWindow;
import org.mozilla.interfaces.nsIDOMXPathEvaluator;
import org.mozilla.interfaces.nsIDOMXPathNSResolver;
import org.mozilla.interfaces.nsIDOMXPathResult;
import org.mozilla.interfaces.nsISupports;
import org.mozilla.interfaces.nsIWebBrowser;
import org.mozilla.xpcom.Mozilla;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLAnchorElement;
import org.w3c.dom.html.HTMLDocument;
/**
 * Minimal SWT/Mozilla browser wrapper that can load a page synchronously
 * ({@link #go(String)}) and evaluate XPath expressions against the loaded
 * document, returning W3C DOM nodes.
 *
 * <p>The SWT event loop runs on a dedicated thread started by the
 * constructor; calls that touch the browser widget are dispatched to that
 * thread through {@code display.syncExec}.
 */
public class SimpleBrowserWithXPath {
    private final static String NS_IDOMXPATHEVALUATOR_CONTRACTID = "@mozilla.org/dom/xpath-evaluator;1";
    // Browser widget; created on the SWT event thread by the constructor.
    private Browser browser;
    // We will need the SWT display to execute methods
    // in the SWT event thread.
    private Display display;
    // Latch used to manage page loading. It has a count of 1: 'go' creates a
    // new latch before starting a load and the progress listener decrements
    // it when the page is loaded. It is written by the thread calling 'go'
    // and read by the SWT event thread, hence volatile.
    private volatile CountDownLatch latch;
    // Default timeout, in milliseconds (60 seconds).
    private long defaultTimeout = 60000;
    // XPath evaluator
    private nsIDOMXPathEvaluator xpathEval;

    /**
     * Creates a web browser which is able to load pages waiting until
     * the page is completely loaded and solve xpaths returning
     * the corresponding nodes.
     *
     * @param xulrunnerPath path of the XULRunner files. NOTE(review): this
     *        value is not read anywhere in this class; the browser is created
     *        with {@code new Browser(shell, SWT.MOZILLA)} only.
     * @throws IllegalStateException if the browser widget cannot be
     *         instantiated, or if the calling thread is interrupted while
     *         waiting for the initialization to finish
     */
    public SimpleBrowserWithXPath(final String xulrunnerPath) {
        // Use a latch to wait for the browser initialization.
        final CountDownLatch initLatch = new CountDownLatch(1);
        // Carries an initialization failure from the SWT thread back to the
        // constructor; published by initLatch.countDown()/await().
        final SWTError[] initError = new SWTError[1];

        // The browser needs a window manager to work. We are using SWT
        // for the graphical interface, so browser methods must be executed
        // in the SWT event thread.
        new Thread("SWT-Event-Thread") {
            @Override
            public void run() {
                display = new Display();
                Shell shell = new Shell(display);
                shell.setSize(800, 600);
                shell.open();
                try {
                    browser = new Browser(shell, SWT.MOZILLA);
                } catch (SWTError e) {
                    System.out.println("Could not instantiate Browser: " + e.getMessage());
                    // Report the failure and release the waiting constructor;
                    // without this the constructor would block forever.
                    initError[0] = e;
                    display.dispose();
                    initLatch.countDown();
                    return;
                }
                // Adapt browser size to shell size
                browser.setBounds(shell.getClientArea());
                // Listens for page loading status.
                browser.addProgressListener(new ProgressListener() {
                    public void changed(ProgressEvent event) {
                    }

                    public void completed(ProgressEvent event) {
                        // When a page is loaded, decrement the latch.
                        // A 'completed' event may arrive before the first
                        // call to 'go', when no latch exists yet.
                        CountDownLatch current = latch;
                        if (current != null) {
                            current.countDown();
                        }
                    }
                });
                // Release the initialization latch, which has value 1,
                // so after this call its value will be 0.
                initLatch.countDown();
                while (!shell.isDisposed()) {
                    if (!display.readAndDispatch()) {
                        display.sleep();
                    }
                }
                System.exit(0);
            }
        }.start();

        try {
            // Waits until the initialization latch is released.
            initLatch.await();
        } catch (InterruptedException e) {
            // Restore the interrupt status instead of clearing it.
            Thread.currentThread().interrupt();
            throw new IllegalStateException(
                    "Interrupted while waiting for the browser initialization", e);
        }
        if (initError[0] != null) {
            throw new IllegalStateException(
                    "Could not instantiate Browser: " + initError[0].getMessage(), initError[0]);
        }

        // Creates the XPath evaluator XPCOM component
        Mozilla moz = Mozilla.getInstance();
        nsIComponentManager componentManager = moz.getComponentManager();
        xpathEval = (nsIDOMXPathEvaluator) componentManager.createInstanceByContractID(
                NS_IDOMXPATHEVALUATOR_CONTRACTID, null,
                nsIDOMXPathEvaluator.NS_IDOMXPATHEVALUATOR_IID);
    }

    /**
     * Loads an URL into the browser and waits until the page is
     * totally loaded.
     *
     * @param url the address to load
     * @throws SimpleBrowserException if the page has not finished loading
     *         within the default timeout
     */
    public void go(final String url) throws SimpleBrowserException {
        // Creates a latch with count 1
        latch = new CountDownLatch(1);
        // Uses the SWT event thread to execute the method to
        // load an URL in the browser.
        display.syncExec(new Runnable() {
            public void run() {
                browser.setUrl(url);
            }
        });
        // Waits for the finish of the page loading, or for a given
        // timeout in case that the loading doesn't finish in a
        // reasonable time.
        boolean timeout = waitLoad(defaultTimeout);
        if (timeout) {
            throw new SimpleBrowserException("Timeout waiting page loading.");
        }
    }

    /**
     * Fetches, on the SWT event thread, the Mozilla DOM HTML document
     * currently held by the browser and wraps it as a W3C document.
     *
     * @return an W3C HTML Document implementation corresponding to
     *         the Mozilla DOM HTML document currently loaded in the browser.
     * @throws IllegalStateException if no document could be obtained because
     *         the browser widget did not provide an {@code nsIWebBrowser}
     */
    public HTMLDocument getW3CDocument() {
        class DocumentGetter implements Runnable {
            private nsIDOMHTMLDocument htmldoc;

            public void run() {
                nsIWebBrowser webBrowser = (nsIWebBrowser) browser.getWebBrowser();
                if (webBrowser == null) {
                    System.out.println("Could not get the nsIWebBrowser from the Browser widget");
                    // Nothing to query; leave htmldoc unset instead of
                    // dereferencing a null browser.
                    return;
                }
                nsIDOMWindow dw = webBrowser.getContentDOMWindow();
                nsIDOMDocument nsDoc = dw.getDocument();
                htmldoc = (nsIDOMHTMLDocument) nsDoc
                        .queryInterface(nsIDOMHTMLDocument.NS_IDOMHTMLDOCUMENT_IID);
            }

            public nsIDOMHTMLDocument getHtmldoc() {
                return htmldoc;
            }
        }
        DocumentGetter dg = new DocumentGetter();
        display.syncExec(dg);
        if (dg.getHtmldoc() == null) {
            throw new IllegalStateException(
                    "Could not get the nsIWebBrowser from the Browser widget");
        }
        return HTMLDocumentImpl.getDOMInstance(dg.getHtmldoc());
    }

    /**
     * Evaluates an XPath expression using the current document as context.
     *
     * @param xpath the XPath expression to evaluate
     * @return a list with the nodes corresponding to a given xpath.
     */
    public List<Node> xpathNodes(String xpath) {
        return xPathNodes(xpath,
                ((HTMLDocumentImpl) getW3CDocument()).getInstance());
    }

    /**
     * Evaluates an XPath expression using the current document as context
     * and returns the results typed as {@code nodeClass}.
     *
     * @param <T> the expected node type
     * @param xpath the XPath expression to evaluate
     * @param nodeClass the class every result node is expected to be an
     *        instance of
     * @return a list of <code>nodeClass</code> nodes corresponding
     *         to a given xpath.
     * @throws ClassCastException if a result node is not an instance of
     *         {@code nodeClass}
     */
    public <T extends Node> List<T> xpathNodes(String xpath, Class<T> nodeClass) {
        List<Node> found = xpathNodes(xpath);
        List<T> typed = new ArrayList<T>(found.size());
        for (Node node : found) {
            // Checked cast: a node of the wrong type fails here rather than
            // later, when the caller reads the list.
            typed.add(nodeClass.cast(node));
        }
        return typed;
    }

    /**
     * Waits on the latch created by {@link #go(String)} until the progress
     * listener reports completion or {@code millis} milliseconds elapse. On
     * timeout the load is stopped and the latch is awaited once more, for at
     * most {@code millis} milliseconds.
     *
     * @param millis maximum time to wait, in milliseconds
     * @return {@code true} if the first wait timed out, {@code false} if the
     *         page finished loading in time
     */
    private boolean waitLoad(long millis) {
        // Work on the latch of this load even if 'go' is called again.
        final CountDownLatch current = latch;
        try {
            boolean timeout = !current.await(millis, TimeUnit.MILLISECONDS);
            if (timeout) {
                // If the timeout expired, then we will stop
                // page loading.
                display.syncExec(new Runnable() {
                    public void run() {
                        browser.stop();
                    }
                });
                // Waits until the loading is stopped
                current.await(millis, TimeUnit.MILLISECONDS);
            }
            return timeout;
        } catch (InterruptedException e) {
            // Keep the interrupt visible to callers further up the stack.
            Thread.currentThread().interrupt();
            throw new Error(e);
        }
    }

    /**
     * Evaluates an XPath expression relative to a context node.
     *
     * @param xpath the XPath expression to evaluate
     * @param context the Mozilla DOM node the expression is evaluated against
     * @return the W3C nodes for the evaluation result, in iteration order
     */
    private List<Node> xPathNodes(String xpath, nsIDOMNode context) {
        // Obtain the Mozilla DOM HTML document
        HTMLDocumentImpl documentImpl = (HTMLDocumentImpl) getW3CDocument();
        nsIDOMHTMLDocument document = documentImpl.getInstance();
        // Creates a name space resolver for the document
        nsIDOMXPathNSResolver res = xpathEval.createNSResolver(document);
        // Evaluates given XPath in a given context, using the resolver created
        // for the current document as an ordered iterator
        nsISupports obj = xpathEval.evaluate(xpath, context, res,
                nsIDOMXPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
        // Obtain the interface corresponding to the XPath XPCOM results object
        nsIDOMXPathResult result = (nsIDOMXPathResult) obj.queryInterface(
                nsIDOMXPathResult.NS_IDOMXPATHRESULT_IID);
        // Extract result nodes for the XPath and add them to a standard
        // List; an XPCOMException propagates to the caller unchanged.
        return getNodes(result);
    }

    /**
     * Drains an XPath iterator result into a list of W3C nodes.
     *
     * @param result the XPath result to iterate
     * @return the converted nodes, in iteration order; empty if the iterator
     *         yields nothing
     */
    private List<Node> getNodes(nsIDOMXPathResult result) {
        List<Node> nodes = new ArrayList<Node>();
        nsIDOMNode node;
        while ((node = result.iterateNext()) != null) {
            // Use the functionality provided by the mozdom4java
            // (in our case, patched) library to obtain the corresponding
            // W3C implementation of a node.
            nodes.add(NodeFactory.getNodeInstance(node));
        }
        return nodes;
    }

    /**
     * Demo entry point: loads a page, prints every node matched by
     * {@code //*} and the href of every anchor, then halts the JVM.
     *
     * @param args optional; {@code args[0]} is passed to the constructor as
     *        the XULRunner path
     */
    public static void main(String[] args) {
        String xulrunnerPath = null;
        if (args.length > 0) {
            xulrunnerPath = args[0];
        }
        // Instantiate our simple web browser
        SimpleBrowserWithXPath simpleBrowser = new SimpleBrowserWithXPath(xulrunnerPath);
        try {
            // Load a page in the browser
            simpleBrowser.go("http://www.google.com");
            // Obtain a list of nodes, without a concrete class,
            // because the XPath may return nodes of different
            // types, so we work with them in a generic way.
            List<Node> nodes = simpleBrowser.xpathNodes("//*");
            for (Node node : nodes) {
                System.out.println("Node Type: " + node.getNodeName()
                        + " -- Content: " + node.getTextContent());
            }
            // Obtain a list of HTMLAnchorElements, because
            // we can be sure about the result of our XPath,
            // if it has any result, will be only of
            // HTMLAnchorElement type.
            for (HTMLAnchorElement a : simpleBrowser.xpathNodes(
                    "//a", HTMLAnchorElement.class)) {
                System.out.println("Anchor: " + a.getHref());
            }
        } catch (SimpleBrowserException e) {
            System.err.println("Problems calling go method.");
            e.printStackTrace();
        }
        // The SWT event thread is still running its loop, so the JVM is
        // halted explicitly.
        Runtime.getRuntime().halt(0);
    }
}