Commit a0131293 authored by yihua.huang's avatar yihua.huang

#39 Parsing html after page.getHtml()

parent f63d33b4
...@@ -9,8 +9,8 @@ import java.util.ArrayList; ...@@ -9,8 +9,8 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
*
* Object storing extracted result and urls to fetch.<br> * Object storing extracted result and urls to fetch.<br>
* Not thread safe.<br>
* Main method: <br> * Main method: <br>
* {@link #getUrl()} get url of current page <br> * {@link #getUrl()} get url of current page <br>
* {@link #getHtml()} get content of current page <br> * {@link #getHtml()} get content of current page <br>
...@@ -19,9 +19,9 @@ import java.util.List; ...@@ -19,9 +19,9 @@ import java.util.List;
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br> * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.downloader.Downloader * @see us.codecraft.webmagic.downloader.Downloader
* @see us.codecraft.webmagic.processor.PageProcessor * @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/ */
public class Page { public class Page {
...@@ -31,6 +31,8 @@ public class Page { ...@@ -31,6 +31,8 @@ public class Page {
private Html html; private Html html;
private String rawText;
private Selectable url; private Selectable url;
private int statusCode; private int statusCode;
...@@ -62,9 +64,17 @@ public class Page { ...@@ -62,9 +64,17 @@ public class Page {
* @return html * @return html
*/ */
public Html getHtml() { public Html getHtml() {
if (html == null) {
html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
}
return html; return html;
} }
/**
* @param html
* @deprecated since 0.4.0
The html is parsed lazily, on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
*/
public void setHtml(Html html) { public void setHtml(Html html) {
this.html = html; this.html = html;
} }
...@@ -95,7 +105,7 @@ public class Page { ...@@ -95,7 +105,7 @@ public class Page {
* *
* @param requests * @param requests
*/ */
public void addTargetRequests(List<String> requests,long priority) { public void addTargetRequests(List<String> requests, long priority) {
synchronized (targetRequests) { synchronized (targetRequests) {
for (String s : requests) { for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
...@@ -172,13 +182,22 @@ public class Page { ...@@ -172,13 +182,22 @@ public class Page {
this.statusCode = statusCode; this.statusCode = statusCode;
} }
public String getRawText() {
return rawText;
}
public void setRawText(String rawText) {
this.rawText = rawText;
}
@Override @Override
public String toString() { public String toString() {
return "Page{" + return "Page{" +
"request=" + request + "request=" + request +
", resultItems=" + resultItems + ", resultItems=" + resultItems +
", html=" + html + ", rawText='" + rawText + '\'' +
", url=" + url + ", url=" + url +
", statusCode=" + statusCode +
", targetRequests=" + targetRequests + ", targetRequests=" + targetRequests +
'}'; '}';
} }
......
...@@ -162,7 +162,7 @@ public class HttpClientDownloader implements Downloader { ...@@ -162,7 +162,7 @@ public class HttpClientDownloader implements Downloader {
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
Page page = new Page(); Page page = new Page();
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setRawText(content);
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
......
...@@ -31,8 +31,7 @@ public class OschinaBlog { ...@@ -31,8 +31,7 @@ public class OschinaBlog {
private Date date; private Date date;
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().setSleepTime(0) OOSpider.create(Site.me(), new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
.addUrl("http://my.oschina.net/flashsword/blog").run(); .addUrl("http://my.oschina.net/flashsword/blog").run();
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment