Commit f0fa1dad authored by yihua.huang's avatar yihua.huang

clean some code

parent 01f49aad
...@@ -18,7 +18,7 @@ public interface PageProcessor { ...@@ -18,7 +18,7 @@ public interface PageProcessor {
/** /**
* the site the processor for * the site the processor for
* @return * @return site
*/ */
public Site getSite(); public Site getSite();
} }
...@@ -35,14 +35,14 @@ public interface Selectable { ...@@ -35,14 +35,14 @@ public interface Selectable {
/** /**
* select a link * select a link
* *
* @return * @return first link
*/ */
public Selectable a(); public Selectable a();
/** /**
* select all links * select all links
* *
* @return * @return all links
*/ */
public Selectable as(); public Selectable as();
......
package us.codecraft.webmagic;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
/**
 * User: cairne
 * Date: 13-4-20
 * Time: 7:46 PM
 * <p>
 * Manual, exploratory tests that wire a {@link Spider} together with
 * different pipelines, schedulers and page processors and then run it.
 * None of the tests make assertions.
 */
public class SpiderTest {

    /**
     * Builds a spider from a {@link FilePipeline} and a {@link HuxiuProcessor}
     * and runs it.
     *
     * @throws InterruptedException declared by the original test signature
     */
    @Test
    public void testSpider() throws InterruptedException {
        Spider spider = Spider.me()
                .pipeline(new FilePipeline())
                .processor(new HuxiuProcessor());
        spider.run();
    }

    /**
     * Runs a spider driven by a {@link SimplePageProcessor} whose site encoding
     * is set to GBK, using a {@link FileCacheQueueSchedular} that is given the
     * path {@code /data/temp/webmagic/cache/}. The configured encoding is
     * printed before the spider starts.
     */
    @Test
    public void testGlobalSpider() {
        // Earlier variant, kept for reference:
        // PageProcessor pageProcessor = new MeicanProcessor();
        // Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
        // processor(pageProcessor).run();
        SimplePageProcessor simpleProcessor = new SimplePageProcessor(
                "http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space",
                "http://www.diaoyuweng.com/thread-*-1-1.html");
        simpleProcessor.getSite().setEncoding("GBK");
        System.out.println(simpleProcessor.getSite().getEncoding());

        Spider.me()
                .pipeline(new FilePipeline())
                .schedular(new FileCacheQueueSchedular(simpleProcessor.getSite(), "/data/temp/webmagic/cache/"))
                .processor(simpleProcessor)
                .run();
    }

    /**
     * Prints the value of the {@code java.io.tmpdir} system property.
     */
    @Test
    public void test() {
        String tmpDir = System.getProperty("java.io.tmpdir");
        System.out.println(tmpDir);
    }

    /**
     * Holds design sketches for an extraction language as comments only;
     * the method body executes nothing and the test is ignored.
     */
    @Ignore
    @Test
    public void languageSchema() {
        /*
         * Sketch 1: field extraction and site settings.
         *
         * _hrefs = rs("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}")
         * title = r(""<title>(.*)</title>"")
         * body = x("//dd[@class='w133']")
         *
         * site.domain = "sh.58.com"
         * site.ua=""
         * site.cookie="aa:bb"
         *
         */

        /*
         * Sketch 2: conditional rules with chained selectors.
         *
         * if (page == r('') && refer(1) == 1) {
         *
         * type = _refer(1)
         * content = _text.t().c()
         * title = x("asd@asd").r("",1)
         * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c())
         *
         * body=body[r(_currentUrl).g(1)]
         * tags[%] = (tags[%] + xs('')) . r('')
         *
         * _targetUrls.add('' + x('').r(''))
         * _sourceUrls.add()
         * _header.put("","");
         * _cookie.add("asdsadasdsa");
         *
         *
         * }
         *
         * _cookie.add(_cookie[''])
         *
         * if (page == r('') && refer(1) == 1)
         * (
         * _targetUrl = '' + x('') & r('')
         * _sourceUrl = ''
         * )
         *
         */

        /*
         * Sketch 3: XML-style layout.
         *
         * <condition></>
         * <selector>
         * <fields>
         *
         * <type>
         * <selector></selector>
         * <selector></selector>
         * </type>
         * </>
         * </>
         */

        /*
         * Sketch 4: model-based rules with '>' piping between selectors.
         *
         * if (model.url('') && model.refer(1) == 1)
         * (
         *
         * model.set(type, model.refer(1))
         * content = t(_html) > c()
         * title = x(_html, 'asd@asd') > r('',1)
         * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('')
         * tags[%] = tags + xs('') > r('')
         * model.setTargetUrl();
         *
         * _targetUrl = '' + x('') & r('')
         * _sourceUrl = ''
         * )
         *
         * _cookie.add(_cookie[''])
         *
         * if (page == r('') && refer(1) == 1)
         * (
         * _targetUrl = '' + x('') & r('')
         * _sourceUrl = ''
         * )
         *
         */
    }
}
package us.codecraft.webmagic.selector;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.Test;
import java.io.IOException;
import java.net.URL;
/**
 * User: cairne
 * Date: 13-4-21
 * Time: 10:35 AM
 * <p>
 * Exploratory test that feeds a live web page to {@link HtmlCleaner} and
 * prints the parsed result. It makes no assertions.
 */
public class HtmlCleanerTest {

    /**
     * Cleans the document at {@code http://www.huanqiu.com}, passing
     * {@code "UTF-8"} as the second argument to {@code clean}, then prints
     * the result of {@code getAllElementsList(true)} followed by the root node.
     *
     * @throws IOException if building the URL or cleaning the page fails
     */
    @Test
    public void test() throws IOException {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        // The previously fetched CleanerProperties local was never read, so it is dropped.
        TagNode node = htmlCleaner.clean(new URL("http://www.huanqiu.com"), "UTF-8");
        System.out.println(node.getAllElementsList(true));
        System.out.println(node);
    }
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
</logger> </logger>
<root> <root>
<level value="debug" /> <level value="info" />
<appender-ref ref="stdout" /> <appender-ref ref="stdout" />
</root> </root>
......
package us.codecraft.webmagic;
import org.junit.Assert;
import org.junit.Test;
import us.codecraft.webmagic.selector.Html;
/**
 * User: cairne
 * Date: 13-4-21
 * Time: 8:42 AM
 * <p>
 * Unit test for chained selector calls on {@link Html}.
 */
public class HtmlTest {

    /**
     * Wraps the text {@code "aaaaaaab"} in an {@link Html}, applies
     * {@code r("(.*)")} followed by {@code rp("aa(a)", "$1bb")}, and expects
     * the string form of the result to be {@code "abbabbab"}.
     * NOTE(review): {@code r} and {@code rp} are presumably regex-select and
     * regex-replace; confirm against the selector API.
     */
    @Test
    public void testRegexSelector() {
        Html html = new Html("aaaaaaab");
        String replaced = html.r("(.*)").rp("aa(a)", "$1bb").toString();
        Assert.assertEquals("abbabbab", replaced);
    }
}
...@@ -15,12 +15,14 @@ import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; ...@@ -15,12 +15,14 @@ import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
public class SpiderTest { public class SpiderTest {
@Ignore
@Test @Test
public void testSpider() throws InterruptedException { public void testSpider() throws InterruptedException {
Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor());
me.run(); me.run();
} }
@Ignore
@Test @Test
public void testGlobalSpider(){ public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor(); // PageProcessor pageProcessor = new MeicanProcessor();
...@@ -35,6 +37,7 @@ public class SpiderTest { ...@@ -35,6 +37,7 @@ public class SpiderTest {
} }
@Ignore
@Test @Test
public void test(){ public void test(){
System.out.println(System.getProperty("java.io.tmpdir")); System.out.println(System.getProperty("java.io.tmpdir"));
......
package us.codecraft.webmagic.processor; package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline;
...@@ -16,6 +17,7 @@ import java.io.IOException; ...@@ -16,6 +17,7 @@ import java.io.IOException;
*/ */
public class DiandianProcessorTest { public class DiandianProcessorTest {
@Ignore
@Test @Test
public void test() throws IOException { public void test() throws IOException {
DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor(); DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor();
......
package us.codecraft.webmagic.processor; package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
...@@ -16,6 +17,7 @@ import java.io.IOException; ...@@ -16,6 +17,7 @@ import java.io.IOException;
*/ */
public class DiaoyuwengProcessorTest { public class DiaoyuwengProcessorTest {
@Ignore
@Test @Test
public void test() throws IOException { public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
......
package us.codecraft.webmagic.processor; package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FilePipeline;
...@@ -16,6 +17,7 @@ import java.io.IOException; ...@@ -16,6 +17,7 @@ import java.io.IOException;
*/ */
public class SinablogProcessorTest { public class SinablogProcessorTest {
@Ignore
@Test @Test
public void test() throws IOException { public void test() throws IOException {
SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser(); SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment