Commit f0fa1dad authored by yihua.huang

clean some code

parent 01f49aad
......@@ -18,7 +18,7 @@ public interface PageProcessor {
/**
* the site the processor is for
* @return
* @return site
*/
public Site getSite();
}
......@@ -35,14 +35,14 @@ public interface Selectable {
/**
* select a link
*
* @return
* @return first link
*/
public Selectable a();
/**
* select all links
*
* @return
* @return all links
*/
public Selectable as();
......
package us.codecraft.webmagic;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
/**
 * Hand-run crawler experiments, plus free-form design notes kept in
 * {@link #languageSchema()}.
 *
 * User: cairne
 * Date: 13-4-20
 * Time: 7:46 PM
 */
public class SpiderTest {

    /**
     * Builds a spider from a {@link FilePipeline} and a {@link HuxiuProcessor}
     * and runs it. Nothing is asserted.
     */
    @Test
    public void testSpider() throws InterruptedException {
        Spider.me()
                .pipeline(new FilePipeline())
                .processor(new HuxiuProcessor())
                .run();
    }

    /**
     * Configures a {@link SimplePageProcessor} with GBK encoding, prints the
     * encoding back, and runs a spider that uses a {@link FileCacheQueueSchedular}
     * pointed at a fixed cache directory. Nothing is asserted.
     */
    @Test
    public void testGlobalSpider() {
        // Earlier variant of this run, left disabled:
        // PageProcessor pageProcessor = new MeicanProcessor();
        // Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
        // processor(pageProcessor).run();

        // The two constructor arguments, named after what the strings contain.
        final String memberThreadsUrl = "http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space";
        final String threadUrlWildcard = "http://www.diaoyuweng.com/thread-*-1-1.html";
        final String cacheDir = "/data/temp/webmagic/cache/";

        SimplePageProcessor forumProcessor = new SimplePageProcessor(memberThreadsUrl, threadUrlWildcard);
        forumProcessor.getSite().setEncoding("GBK");
        System.out.println(forumProcessor.getSite().getEncoding());

        Spider.me()
                .pipeline(new FilePipeline())
                .schedular(new FileCacheQueueSchedular(forumProcessor.getSite(), cacheDir))
                .processor(forumProcessor)
                .run();
    }

    /**
     * Prints the value of the {@code java.io.tmpdir} system property.
     */
    @Test
    public void test() {
        String tmpDir = System.getProperty("java.io.tmpdir");
        System.out.println(tmpDir);
    }

    /**
     * Ignored placeholder with no executable statements. The body only holds
     * notes that look like syntax sketches for an extraction scripting
     * language (NOTE(review): confirm intent with the author).
     */
    @Ignore
    @Test
    public void languageSchema() {
        /*
         * Sketch 1: flat assignments for extracted fields and site settings.
         *
         * _hrefs = rs("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}")
         * title = r(""<title>(.*)</title>"")
         * body = x("//dd[@class='w133']")
         *
         * site.domain = "sh.58.com"
         * site.ua=""
         * site.cookie="aa:bb"
         *
         */

        /*
         * Sketch 2: conditional blocks with chained selector calls.
         *
         * if (page == r('') && refer(1) == 1) {
         *
         * type = _refer(1)
         * content = _text.t().c()
         * title = x("asd@asd").r("",1)
         * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c())
         *
         * body=body[r(_currentUrl).g(1)]
         * tags[%] = (tags[%] + xs('')) . r('')
         *
         * _targetUrls.add('' + x('').r(''))
         * _sourceUrls.add()
         * _header.put("","");
         * _cookie.add("asdsadasdsa");
         *
         *
         * }
         *
         * _cookie.add(_cookie[''])
         *
         * if (page == r('') && refer(1) == 1)
         * (
         * _targetUrl = '' + x('') & r('')
         * _sourceUrl = ''
         * )
         *
         */

        /*
         * Sketch 3: an XML-shaped layout.
         *
         * <condition></>
         * <selector>
         * <fields>
         *
         * <type>
         * <selector></selector>
         * <selector></selector>
         * </type>
         * </>
         * </>
         */

        /*
         * Sketch 4: a "model"-based form using '>' between selector steps.
         *
         * if (model.url('') && model.refer(1) == 1)
         * (
         *
         * model.set(type, model.refer(1))
         * content = t(_html) > c()
         * title = x(_html, 'asd@asd') > r('',1)
         * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('')
         * tags[%] = tags + xs('') > r('')
         * model.setTargetUrl();
         *
         * _targetUrl = '' + x('') & r('')
         * _sourceUrl = ''
         * )
         *
         * _cookie.add(_cookie[''])
         *
         * if (page == r('') && refer(1) == 1)
         * (
         * _targetUrl = '' + x('') & r('')
         * _sourceUrl = ''
         * )
         *
         */
    }
}
package us.codecraft.webmagic.selector;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.Test;
import java.io.IOException;
import java.net.URL;
/**
 * Exploratory check that feeds a live web page to HtmlCleaner and prints the
 * resulting node tree.
 *
 * User: cairne
 * Date: 13-4-21
 * Time: 10:35 AM
 */
public class HtmlCleanerTest {

    /**
     * Cleans the document at {@code http://www.huanqiu.com}, passing
     * {@code "UTF-8"} as the second argument to {@code clean}, then prints the
     * result of {@code getAllElementsList(true)} followed by the root node
     * itself. Nothing is asserted; the output is meant to be read by hand.
     *
     * NOTE(review): a remote URL is handed to the cleaner, so this presumably
     * needs network access to pass.
     *
     * @throws IOException propagated from {@code new URL(...)} or {@code clean}
     */
    @Test
    public void test() throws IOException {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        // The cleaner's properties were previously fetched into an unused
        // local; the cleaner runs with its defaults either way.
        TagNode node = htmlCleaner.clean(new URL("http://www.huanqiu.com"), "UTF-8");
        System.out.println(node.getAllElementsList(true));
        System.out.println(node);
    }
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -24,7 +24,7 @@
</logger>
<root>
<level value="debug" />
<level value="info" />
<appender-ref ref="stdout" />
</root>
......
package us.codecraft.webmagic;
import org.junit.Assert;
import org.junit.Test;
import us.codecraft.webmagic.selector.Html;
/**
 * Tests for {@link Html}.
 *
 * User: cairne
 * Date: 13-4-21
 * Time: 8:42 AM
 */
public class HtmlTest {

    /**
     * Builds an {@link Html} from {@code "aaaaaaab"}, chains {@code r("(.*)")}
     * and {@code rp("aa(a)", "$1bb")}, and expects the string form of the
     * result to be {@code "abbabbab"}.
     */
    @Test
    public void testRegexSelector() {
        Html html = new Html("aaaaaaab");
        String replaced = html.r("(.*)").rp("aa(a)", "$1bb").toString();
        Assert.assertEquals("abbabbab", replaced);
    }
}
......@@ -15,12 +15,14 @@ import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
public class SpiderTest {
@Ignore
@Test
public void testSpider() throws InterruptedException {
Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor());
me.run();
}
@Ignore
@Test
public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor();
......@@ -35,6 +37,7 @@ public class SpiderTest {
}
@Ignore
@Test
public void test(){
System.out.println(System.getProperty("java.io.tmpdir"));
......
package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
......@@ -16,6 +17,7 @@ import java.io.IOException;
*/
public class DiandianProcessorTest {
@Ignore
@Test
public void test() throws IOException {
DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor();
......
package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
......@@ -16,6 +17,7 @@ import java.io.IOException;
*/
public class DiaoyuwengProcessorTest {
@Ignore
@Test
public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
......
package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
......@@ -16,6 +17,7 @@ import java.io.IOException;
*/
public class SinablogProcessorTest {
@Ignore
@Test
public void test() throws IOException {
SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment