Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
webmagic
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
沈俊林
webmagic
Commits
e06b0c1a
Commit
e06b0c1a
authored
Sep 04, 2013
by
yihua.huang
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'xsoup'
parents
b9eeb88f
aefd0569
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
33 changed files
with
1283 additions
and
307 deletions
+1283
-307
pom.xml
pom.xml
+1
-1
pom.xml
webmagic-core/pom.xml
+7
-1
Page.java
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+4
-3
Spider.java
...agic-core/src/main/java/us/codecraft/webmagic/Spider.java
+9
-0
BaseElementSelector.java
...a/us/codecraft/webmagic/selector/BaseElementSelector.java
+23
-0
CssSelector.java
...main/java/us/codecraft/webmagic/selector/CssSelector.java
+11
-15
ElementSelector.java
.../java/us/codecraft/webmagic/selector/ElementSelector.java
+32
-0
Html.java
...re/src/main/java/us/codecraft/webmagic/selector/Html.java
+60
-12
PlainText.java
...c/main/java/us/codecraft/webmagic/selector/PlainText.java
+1
-13
Selectable.java
.../main/java/us/codecraft/webmagic/selector/Selectable.java
+0
-14
SelectorFactory.java
.../java/us/codecraft/webmagic/selector/SelectorFactory.java
+0
-91
Selectors.java
...c/main/java/us/codecraft/webmagic/selector/Selectors.java
+8
-12
TextContentSelector.java
...a/us/codecraft/webmagic/selector/TextContentSelector.java
+0
-68
XsoupSelector.java
...in/java/us/codecraft/webmagic/selector/XsoupSelector.java
+32
-0
EnvironmentUtil.java
...ain/java/us/codecraft/webmagic/utils/EnvironmentUtil.java
+28
-0
TextContentSelectorTest.java
.../codecraft/webmagic/selector/TextContentSelectorTest.java
+0
-34
EnvironmentUtilTest.java
...java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java
+18
-0
pom.xml
webmagic-extension/pom.xml
+1
-1
PageModelExtractor.java
.../java/us/codecraft/webmagic/model/PageModelExtractor.java
+22
-14
FilePageModelPipeline.java
...us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
+1
-1
ExtractorUtils.java
...main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
+15
-7
pom.xml
webmagic-samples/pom.xml
+1
-1
DiaoyuwengProcessor.java
...va/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
+6
-1
F58PageProcesser.java
.../java/us/codecraft/webmagic/samples/F58PageProcesser.java
+7
-2
HuxiuProcessor.java
...in/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
+7
-3
InfoQMiniBookProcessor.java
...us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
+0
-4
IteyeBlogProcessor.java
...ava/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
+1
-2
KaichibaProcessor.java
...java/us/codecraft/webmagic/samples/KaichibaProcessor.java
+5
-0
MeicanProcessor.java
...n/java/us/codecraft/webmagic/samples/MeicanProcessor.java
+7
-2
OschinaBlogPageProcesser.java
.../codecraft/webmagic/samples/OschinaBlogPageProcesser.java
+4
-5
ProcessorBenchmark.java
.../src/test/java/us/codecraft/model/ProcessorBenchmark.java
+890
-0
pom.xml
webmagic-saxon/pom.xml
+5
-0
XpathSelectorTest.java
...ava/us/codecraft/webmagic/selector/XpathSelectorTest.java
+77
-0
No files found.
pom.xml
View file @
e06b0c1a
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
<version>
7
</version>
<version>
7
</version>
</parent>
</parent>
<groupId>
us.codecraft
</groupId>
<groupId>
us.codecraft
</groupId>
<version>
0.
2.2
-SNAPSHOT
</version>
<version>
0.
3.0
-SNAPSHOT
</version>
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<packaging>
pom
</packaging>
<packaging>
pom
</packaging>
<properties>
<properties>
...
...
webmagic-core/pom.xml
View file @
e06b0c1a
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<parent>
<parent>
<groupId>
us.codecraft
</groupId>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.
2.2
-SNAPSHOT
</version>
<version>
0.
3.0
-SNAPSHOT
</version>
</parent>
</parent>
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
...
@@ -25,6 +25,12 @@
...
@@ -25,6 +25,12 @@
<artifactId>
commons-lang3
</artifactId>
<artifactId>
commons-lang3
</artifactId>
</dependency>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
xsoup
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
<dependency>
<dependency>
<groupId>
log4j
</groupId>
<groupId>
log4j
</groupId>
<artifactId>
log4j
</artifactId>
<artifactId>
log4j
</artifactId>
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
;
package
us
.
codecraft
.
webmagic
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.selector.Selectable
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.UrlUtils
;
...
@@ -28,7 +29,7 @@ public class Page {
...
@@ -28,7 +29,7 @@ public class Page {
private
ResultItems
resultItems
=
new
ResultItems
();
private
ResultItems
resultItems
=
new
ResultItems
();
private
Selectable
html
;
private
Html
html
;
private
Selectable
url
;
private
Selectable
url
;
...
@@ -58,11 +59,11 @@ public class Page {
...
@@ -58,11 +59,11 @@ public class Page {
*
*
* @return html
* @return html
*/
*/
public
Selectable
getHtml
()
{
public
Html
getHtml
()
{
return
html
;
return
html
;
}
}
public
void
setHtml
(
Selectable
html
)
{
public
void
setHtml
(
Html
html
)
{
this
.
html
=
html
;
this
.
html
=
html
;
}
}
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
View file @
e06b0c1a
...
@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
...
@@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.QueueScheduler
;
import
us.codecraft.webmagic.scheduler.QueueScheduler
;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
us.codecraft.webmagic.utils.ThreadUtils
;
import
us.codecraft.webmagic.utils.ThreadUtils
;
import
java.io.Closeable
;
import
java.io.Closeable
;
...
@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task {
...
@@ -368,6 +369,14 @@ public class Spider implements Runnable, Task {
return
this
;
return
this
;
}
}
/**
 * Switches Xsoup off by calling {@code EnvironmentUtil.setUseXsoup(false)}.
 * This is a static method: it takes no arguments and returns nothing.
 */
public
static
void
xsoupOff
(){
EnvironmentUtil
.
setUseXsoup
(
false
);
}
@Override
@Override
public
String
getUUID
()
{
public
String
getUUID
()
{
if
(
uuid
!=
null
)
{
if
(
uuid
!=
null
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.Jsoup
;
import
java.util.List
;
/**
 * Base class for selectors that work on parsed jsoup elements.<br>
 * The text-based entry points parse their input with {@link Jsoup#parse(String)}
 * and hand the resulting document to the element-based methods declared by
 * {@link ElementSelector}, which subclasses implement.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.0
 */
public abstract class BaseElementSelector implements Selector, ElementSelector {

    /**
     * Parses {@code text} as HTML and selects a single result from the parsed document.
     *
     * @param text HTML source
     * @return whatever {@code select(Element)} returns for the parsed document
     */
    @Override
    public String select(String text) {
        return this.select(Jsoup.parse(text));
    }

    /**
     * Parses {@code text} as HTML and selects all results from the parsed document.
     *
     * @param text HTML source
     * @return whatever {@code selectList(Element)} returns for the parsed document
     */
    @Override
    public List<String> selectList(String text) {
        return this.selectList(Jsoup.parse(text));
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.jsoup.select.Elements
;
...
@@ -15,7 +13,7 @@ import java.util.List;
...
@@ -15,7 +13,7 @@ import java.util.List;
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @since 0.1.0
*/
*/
public
class
CssSelector
implements
Selector
{
public
class
CssSelector
extends
BaseElement
Selector
{
private
String
selectorText
;
private
String
selectorText
;
...
@@ -30,16 +28,6 @@ public class CssSelector implements Selector {
...
@@ -30,16 +28,6 @@ public class CssSelector implements Selector {
this
.
attrName
=
attrName
;
this
.
attrName
=
attrName
;
}
}
@Override
public
String
select
(
String
text
)
{
Document
doc
=
Jsoup
.
parse
(
text
);
Elements
elements
=
doc
.
select
(
selectorText
);
if
(
CollectionUtils
.
isEmpty
(
elements
))
{
return
null
;
}
return
getValue
(
elements
.
get
(
0
));
}
private
String
getValue
(
Element
element
)
{
private
String
getValue
(
Element
element
)
{
if
(
attrName
==
null
)
{
if
(
attrName
==
null
)
{
return
element
.
outerHtml
();
return
element
.
outerHtml
();
...
@@ -51,9 +39,17 @@ public class CssSelector implements Selector {
...
@@ -51,9 +39,17 @@ public class CssSelector implements Selector {
}
}
@Override
@Override
public
List
<
String
>
selectList
(
String
text
)
{
public
String
select
(
Element
element
)
{
Elements
elements
=
element
.
select
(
selectorText
);
if
(
CollectionUtils
.
isEmpty
(
elements
))
{
return
null
;
}
return
getValue
(
elements
.
get
(
0
));
}
@Override
public
List
<
String
>
selectList
(
Element
doc
)
{
List
<
String
>
strings
=
new
ArrayList
<
String
>();
List
<
String
>
strings
=
new
ArrayList
<
String
>();
Document
doc
=
Jsoup
.
parse
(
text
);
Elements
elements
=
doc
.
select
(
selectorText
);
Elements
elements
=
doc
.
select
(
selectorText
);
if
(
CollectionUtils
.
isNotEmpty
(
elements
))
{
if
(
CollectionUtils
.
isNotEmpty
(
elements
))
{
for
(
Element
element
:
elements
)
{
for
(
Element
element
:
elements
)
{
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.nodes.Element
;
import
java.util.List
;
/**
 * Contract for selectors (extractors) that operate directly on an html {@link Element}.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.0
 */
public interface ElementSelector {

    /**
     * Extracts a single result as text.<br>
     * When more than one result matches, only the first one is chosen.
     *
     * @param element element to select from
     * @return result
     */
    String select(Element element);

    /**
     * Extracts all results as text.<br>
     *
     * @param element element to select from
     * @return results
     */
    List<String> selectList(Element element);
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
us.codecraft.webmagic.utils.EnvironmentUtil
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
...
@@ -11,12 +15,23 @@ import java.util.List;
...
@@ -11,12 +15,23 @@ import java.util.List;
*/
*/
public
class
Html
extends
PlainText
{
public
class
Html
extends
PlainText
{
/**
 * Stores the parsed document for better performance when there is only one text.
*/
private
Document
document
;
public
Html
(
List
<
String
>
strings
)
{
public
Html
(
List
<
String
>
strings
)
{
super
(
strings
);
super
(
strings
);
}
}
public
Html
(
String
text
)
{
public
Html
(
String
text
)
{
super
(
text
);
super
(
text
);
this
.
document
=
Jsoup
.
parse
(
text
);
}
public
Html
(
Document
document
)
{
super
(
document
.
html
());
this
.
document
=
document
;
}
}
public
static
Html
create
(
String
text
)
{
public
static
Html
create
(
String
text
)
{
...
@@ -53,38 +68,71 @@ public class Html extends PlainText {
...
@@ -53,38 +68,71 @@ public class Html extends PlainText {
@Override
@Override
public
Selectable
links
()
{
public
Selectable
links
()
{
XpathSelector
xpathSelector
=
Selectors
.
xpath
(
"//a/@href"
);
return
xpath
(
"//a/@href"
);
return
selectList
(
xpathSelector
,
strings
);
}
}
@Override
@Override
public
Selectable
xpath
(
String
xpath
)
{
public
Selectable
xpath
(
String
xpath
)
{
XpathSelector
xpathSelector
=
Selectors
.
xpath
(
xpath
);
if
(
EnvironmentUtil
.
useXsoup
())
{
return
selectList
(
xpathSelector
,
strings
);
XsoupSelector
xsoupSelector
=
new
XsoupSelector
(
xpath
);
if
(
document
!=
null
)
{
return
new
Html
(
xsoupSelector
.
selectList
(
document
));
}
return
selectList
(
xsoupSelector
,
strings
);
}
else
{
XpathSelector
xpathSelector
=
new
XpathSelector
(
xpath
);
return
selectList
(
xpathSelector
,
strings
);
}
}
}
@Override
@Override
public
Selectable
$
(
String
selector
)
{
public
Selectable
$
(
String
selector
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
);
CssSelector
cssSelector
=
Selectors
.
$
(
selector
);
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
return
selectList
(
cssSelector
,
strings
);
return
selectList
(
cssSelector
,
strings
);
}
}
@Override
@Override
public
Selectable
$
(
String
selector
,
String
attrName
)
{
public
Selectable
$
(
String
selector
,
String
attrName
)
{
CssSelector
cssSelector
=
Selectors
.
$
(
selector
,
attrName
);
CssSelector
cssSelector
=
Selectors
.
$
(
selector
,
attrName
);
if
(
document
!=
null
)
{
return
new
Html
(
cssSelector
.
selectList
(
document
));
}
return
selectList
(
cssSelector
,
strings
);
return
selectList
(
cssSelector
,
strings
);
}
}
@Override
public
Document
getDocument
()
{
public
Selectable
text
()
{
return
document
;
TextContentSelector
selector
=
Selectors
.
text
();
return
select
(
selector
,
strings
);
}
}
@Override
public
String
getText
()
{
public
Selectable
text
(
String
newlineSeparator
)
{
if
(
strings
!=
null
&&
strings
.
size
()>
0
){
TextContentSelector
selector
=
Selectors
.
text
(
newlineSeparator
);
return
strings
.
get
(
0
);
return
select
(
selector
,
strings
);
}
return
document
.
html
();
}
/**
* @param selector
* @return
*/
public
String
selectDocument
(
Selector
selector
)
{
if
(
selector
instanceof
ElementSelector
)
{
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
return
elementSelector
.
select
(
getDocument
());
}
else
{
return
selector
.
select
(
getText
());
}
}
}
public
List
<
String
>
selectDocumentForList
(
Selector
selector
)
{
if
(
selector
instanceof
ElementSelector
)
{
ElementSelector
elementSelector
=
(
ElementSelector
)
selector
;
return
elementSelector
.
selectList
(
getDocument
());
}
else
{
return
selector
.
selectList
(
getText
());
}
}
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
View file @
e06b0c1a
...
@@ -89,7 +89,7 @@ public class PlainText implements Selectable {
...
@@ -89,7 +89,7 @@ public class PlainText implements Selectable {
@Override
@Override
public
Selectable
replace
(
String
regex
,
String
replacement
)
{
public
Selectable
replace
(
String
regex
,
String
replacement
)
{
ReplaceSelector
replaceSelector
=
SelectorFactory
.
getInstatnce
().
newReplaceSelector
(
regex
,
replacement
);
ReplaceSelector
replaceSelector
=
new
ReplaceSelector
(
regex
,
replacement
);
return
select
(
replaceSelector
,
strings
);
return
select
(
replaceSelector
,
strings
);
}
}
...
@@ -107,18 +107,6 @@ public class PlainText implements Selectable {
...
@@ -107,18 +107,6 @@ public class PlainText implements Selectable {
}
}
}
}
@Override
public
Selectable
text
()
{
//do nothing
return
this
;
}
@Override
public
Selectable
text
(
String
newlineSeparator
)
{
//do nothing
return
this
;
}
@Override
@Override
public
boolean
match
()
{
public
boolean
match
()
{
return
strings
!=
null
&&
strings
.
size
()
>
0
;
return
strings
!=
null
&&
strings
.
size
()
>
0
;
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
View file @
e06b0c1a
...
@@ -82,20 +82,6 @@ public interface Selectable {
...
@@ -82,20 +82,6 @@ public interface Selectable {
*/
*/
public
String
toString
();
public
String
toString
();
/**
* select text content of html
*
* @return text
*/
public
Selectable
text
();
/**
* select text content of html
*
* @return text
*/
public
Selectable
text
(
String
newlineSeparator
);
/**
/**
* if result exist for select
* if result exist for select
*
*
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java
deleted
100644 → 0
View file @
b9eeb88f
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.apache.commons.lang3.StringUtils
;
import
java.lang.reflect.Constructor
;
import
java.util.Map
;
import
java.util.concurrent.ConcurrentHashMap
;
/**
 * Selector factory with some inner cache.<br>
 * Selectors are created reflectively through public constructors that take zero, one or two
 * {@code String} arguments. Only {@link #newAndCacheSelector(Class, String...)} stores the
 * instances it creates in the cache.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SelectorFactory {

    /** Cached selectors, keyed by {@link #getCacheKey(Class, String...)}. */
    private Map<String, Selector> innerCache = new ConcurrentHashMap<String, Selector>();

    private static final SelectorFactory INSTATNCE = new SelectorFactory();

    /**
     * @return the shared factory instance
     */
    public static SelectorFactory getInstatnce() {
        return INSTATNCE;
    }

    /**
     * Creates a new, uncached {@link RegexSelector}.
     *
     * @param regex regular expression passed to the selector constructor
     * @return a new selector
     */
    public RegexSelector newRegexSelector(String regex) {
        return newSelector(RegexSelector.class, regex);
    }

    /**
     * Returns the cached {@link RegexSelector} for {@code regex} and {@code group} if one is
     * present, otherwise creates a new one.
     *
     * @param regex regular expression
     * @param group capture group passed to the selector constructor
     * @return a cached or new selector
     */
    public RegexSelector newRegexSelector(String regex, int group) {
        String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
        // Look up once instead of twice; keys are prefixed with the class, so the cast is safe.
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return (RegexSelector) cached;
        }
        // NOTE(review): the new instance is not stored, exactly as before, so this lookup
        // only hits if some other caller cached a selector under the same key - confirm
        // whether caching was intended here.
        return new RegexSelector(regex, group);
    }

    /**
     * Creates a new, uncached {@link ReplaceSelector}.
     *
     * @param regex       regular expression passed to the selector constructor
     * @param replacement replacement passed to the selector constructor
     * @return a new selector
     */
    public ReplaceSelector newReplaceSelector(String regex, String replacement) {
        return newSelector(ReplaceSelector.class, regex, replacement);
    }

    /**
     * Creates a new, uncached {@link XpathSelector}.
     *
     * @param xpath expression passed to the selector constructor
     * @return a new selector
     */
    public XpathSelector newXpathSelector(String xpath) {
        return newSelector(XpathSelector.class, xpath);
    }

    /**
     * Creates a new, uncached {@link SmartContentSelector} through its no-arg constructor.
     *
     * @return a new selector
     */
    public SmartContentSelector newSmartContentSelector() {
        return newSelector(SmartContentSelector.class);
    }

    /**
     * Returns the cached selector for {@code clazz} and {@code param}, creating and caching it
     * on first use.
     *
     * @param clazz selector type to create
     * @param param constructor arguments (zero, one or two strings)
     * @param <T>   selector type
     * @return the cached or newly created selector
     * @throws IllegalArgumentException if the selector cannot be instantiated
     */
    public <T extends Selector> T newAndCacheSelector(Class<T> clazz, String... param) {
        // The key must be derived from the requested class. It used to be hard-wired to
        // RegexSelector.class, which made selectors of different types share one cache
        // slot and handed callers an instance of the wrong type.
        String cacheKey = getCacheKey(clazz, param);
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return clazz.cast(cached);
        }
        T selector = newSelector(clazz, param);
        if (selector != null) {
            innerCache.put(cacheKey, selector);
        }
        return selector;
    }

    /**
     * Instantiates {@code clazz} through its public constructor taking {@code param.length}
     * {@code String} arguments.
     *
     * @param clazz selector type to create
     * @param param constructor arguments (zero, one or two strings)
     * @param <T>   selector type
     * @return a new selector
     * @throws IllegalArgumentException if instantiation fails for any reason, including more
     *                                  than two arguments being supplied
     */
    public <T extends Selector> T newSelector(Class<T> clazz, String... param) {
        try {
            if (param.length == 0) {
                Constructor<T> constructor = clazz.getConstructor();
                return constructor.newInstance();
            } else if (param.length == 1) {
                Constructor<T> constructor = clazz.getConstructor(String.class);
                return constructor.newInstance(param[0]);
            } else if (param.length == 2) {
                Constructor<T> constructor = clazz.getConstructor(String.class, String.class);
                return constructor.newInstance(param[0], param[1]);
            } else {
                // Caught below and rethrown as IllegalArgumentException; kept so callers
                // keep seeing the same exception type as before.
                throw new UnsupportedOperationException();
            }
        } catch (Exception e) {
            throw new IllegalArgumentException("init object error", e);
        }
    }

    /**
     * Builds the cache key: the class's {@code toString()} followed by the parameters, all
     * joined with {@code "_"}.
     */
    private String getCacheKey(Class<?> clazz, String... param) {
        return clazz.toString() + "_" + StringUtils.join(param, "_");
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
View file @
e06b0c1a
...
@@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector;
...
@@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector;
public
abstract
class
Selectors
{
public
abstract
class
Selectors
{
public
static
RegexSelector
regex
(
String
expr
)
{
public
static
RegexSelector
regex
(
String
expr
)
{
return
SelectorFactory
.
getInstatnce
().
new
RegexSelector
(
expr
);
return
new
RegexSelector
(
expr
);
}
}
public
static
RegexSelector
regex
(
String
expr
,
int
group
)
{
public
static
RegexSelector
regex
(
String
expr
,
int
group
)
{
return
SelectorFactory
.
getInstatnce
().
newRegexSelector
(
expr
,
group
);
return
new
RegexSelector
(
expr
,
group
);
}
}
public
static
SmartContentSelector
smartContent
()
{
public
static
SmartContentSelector
smartContent
()
{
return
SelectorFactory
.
getInstatnce
().
new
SmartContentSelector
();
return
new
SmartContentSelector
();
}
}
public
static
CssSelector
$
(
String
expr
)
{
public
static
CssSelector
$
(
String
expr
)
{
...
@@ -29,7 +29,11 @@ public abstract class Selectors {
...
@@ -29,7 +29,11 @@ public abstract class Selectors {
}
}
public
static
XpathSelector
xpath
(
String
expr
)
{
public
static
XpathSelector
xpath
(
String
expr
)
{
return
SelectorFactory
.
getInstatnce
().
newXpathSelector
(
expr
);
return
new
XpathSelector
(
expr
);
}
public
static
XsoupSelector
xsoup
(
String
expr
)
{
return
new
XsoupSelector
(
expr
);
}
}
public
static
AndSelector
and
(
Selector
...
selectors
)
{
public
static
AndSelector
and
(
Selector
...
selectors
)
{
...
@@ -40,14 +44,6 @@ public abstract class Selectors {
...
@@ -40,14 +44,6 @@ public abstract class Selectors {
return
new
OrSelector
(
selectors
);
return
new
OrSelector
(
selectors
);
}
}
public
static
TextContentSelector
text
()
{
return
new
TextContentSelector
();
}
public
static
TextContentSelector
text
(
String
newlineSeperator
)
{
return
new
TextContentSelector
(
newlineSeperator
);
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
String
s
=
"a"
;
String
s
=
"a"
;
or
(
regex
(
"<title>(.*)</title>"
),
xpath
(
"//title"
),
$
(
"title"
)).
select
(
s
);
or
(
regex
(
"<title>(.*)</title>"
),
xpath
(
"//title"
),
$
(
"title"
)).
select
(
s
);
...
...
webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java
deleted
100644 → 0
View file @
b9eeb88f
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
java.util.Arrays
;
import
java.util.HashSet
;
import
java.util.List
;
import
java.util.Set
;
/**
* Extract text content in html.<br>
* Algorithm from <a href="http://www.elias.cn/En/ExtMainText">http://www.elias.cn/En/ExtMainText</a>. <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.2
*/
public
class
TextContentSelector
implements
Selector
{
private
String
newLineSeperator
=
"\n"
;
public
TextContentSelector
()
{
}
public
TextContentSelector
(
String
newLineSeperator
)
{
this
.
newLineSeperator
=
newLineSeperator
;
}
private
final
static
Set
<
String
>
TAGS_IN_NEWLINE
=
new
HashSet
<
String
>();
private
final
static
Set
<
String
>
TAGS_TO_IGNORE
=
new
HashSet
<
String
>();
static
{
TAGS_IN_NEWLINE
.
addAll
(
Arrays
.
asList
(
new
String
[]{
"p"
,
"div"
,
"h1"
,
"h2"
,
"h3"
,
"h4"
,
"h5"
,
"h6"
,
"br"
,
"li"
}));
TAGS_TO_IGNORE
.
addAll
(
Arrays
.
asList
(
new
String
[]{
"head"
,
"style"
,
"script"
,
"noscript"
,
"option"
}));
}
@Override
public
String
select
(
String
text
)
{
Document
doc
=
Jsoup
.
parse
(
text
);
return
select0
(
doc
);
}
protected
String
select0
(
Element
element
)
{
String
tagName
=
element
.
tagName
().
toLowerCase
();
if
(
TAGS_TO_IGNORE
.
contains
(
tagName
))
{
return
""
;
}
StringBuilder
textBuilder
=
new
StringBuilder
();
textBuilder
.
append
(
element
.
text
());
if
(
element
.
children
()
!=
null
)
{
for
(
Element
child
:
element
.
children
())
{
textBuilder
.
append
(
select0
(
child
));
}
}
if
(
TAGS_IN_NEWLINE
.
contains
(
tagName
))
{
textBuilder
.
append
(
newLineSeperator
);
}
return
textBuilder
.
toString
();
}
@Override
public
List
<
String
>
selectList
(
String
text
)
{
throw
new
UnsupportedOperationException
();
}
}
webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.jsoup.nodes.Element
;
import
us.codecraft.xsoup.XPathEvaluator
;
import
us.codecraft.xsoup.Xsoup
;
import
java.util.List
;
/**
 * XPath selector backed by Xsoup.<br>
 * The expression is compiled once in the constructor and the compiled evaluator is
 * reused for every selection.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.0
 */
public class XsoupSelector extends BaseElementSelector {

    /** Evaluator compiled from the XPath expression given to the constructor. */
    private XPathEvaluator evaluator;

    /**
     * @param xpathStr XPath expression, compiled with {@code Xsoup.compile}
     */
    public XsoupSelector(String xpathStr) {
        evaluator = Xsoup.compile(xpathStr);
    }

    /**
     * Evaluates the expression against {@code element} and returns the result of {@code get()}.
     */
    @Override
    public String select(Element element) {
        return evaluator.evaluate(element).get();
    }

    /**
     * Evaluates the expression against {@code element} and returns the result of {@code list()}.
     */
    @Override
    public List<String> selectList(Element element) {
        return evaluator.evaluate(element).list();
    }
}
webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
utils
;
import
org.apache.commons.lang3.BooleanUtils
;
import
java.util.Properties
;
/**
* @author code4crafter@gmail.com
* @since 0.3.0
*/
public
abstract
class
EnvironmentUtil
{
private
static
final
String
USE_XSOUP
=
"xsoup"
;
public
static
boolean
useXsoup
()
{
Properties
properties
=
System
.
getProperties
();
Object
o
=
properties
.
get
(
USE_XSOUP
);
if
(
o
==
null
)
{
return
true
;
}
return
BooleanUtils
.
toBoolean
(((
String
)
o
).
toLowerCase
());
}
public
static
void
setUseXsoup
(
boolean
useXsoup
)
{
Properties
properties
=
System
.
getProperties
();
properties
.
setProperty
(
USE_XSOUP
,
BooleanUtils
.
toString
(
useXsoup
,
"true"
,
"false"
));
}
}
webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java
deleted
100644 → 0
View file @
b9eeb88f
package
us
.
codecraft
.
webmagic
.
selector
;
import
junit.framework.Assert
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
/**
 * Tests for {@link TextContentSelector}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class TextContentSelectorTest {

    /**
     * Extracting text from a small html fragment must yield a non-null result.
     */
    @Test
    public void test() {
        String fragment = "<div class=\"edit-comment-hide\">\n"
                + "                <div class=\"js-comment-body comment-body markdown-body markdown-format\">\n"
                + "                  <p>Add more powerful selector for content text extract refered to <a href=\"http://www.elias.cn/En/ExtMainText\">http://www.elias.cn/En/ExtMainText</a></p>\n"
                + "                  </div>\n"
                + "              </div>";
        String extracted = new TextContentSelector("<br>").select(fragment);
        Assert.assertNotNull(extracted);
    }

    /**
     * Downloads a live page and extracts its text; ignored because it takes a long time.
     */
    @Ignore("takes long time")
    @Test
    public void testDownload() {
        HttpClientDownloader downloader = new HttpClientDownloader();
        String content = downloader
                .download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8")
                .smartContent().text().toString();
        Assert.assertNotNull(content);
    }
}
webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
utils
;
import
org.junit.Test
;
import
static
junit
.
framework
.
Assert
.*;
/**
 * Tests for {@link EnvironmentUtil}.
 *
 * @author code4crafter@gmail.com
 */
public class EnvironmentUtilTest {

    /**
     * Xsoup is on while the {@code "xsoup"} system property is unset and off after
     * {@code setUseXsoup(false)}.
     */
    @Test
    public void test() {
        // The switch lives in a JVM-wide system property. Start from a known state and
        // restore the previous value afterwards so that the default assertion does not
        // depend on the environment and other tests are not left with Xsoup switched off.
        String previous = System.getProperty("xsoup");
        System.clearProperty("xsoup");
        try {
            assertTrue(EnvironmentUtil.useXsoup());
            EnvironmentUtil.setUseXsoup(false);
            assertFalse(EnvironmentUtil.useXsoup());
        } finally {
            if (previous == null) {
                System.clearProperty("xsoup");
            } else {
                System.setProperty("xsoup", previous);
            }
        }
    }
}
webmagic-extension/pom.xml
View file @
e06b0c1a
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<parent>
<parent>
<groupId>
us.codecraft
</groupId>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-parent
</artifactId>
<artifactId>
webmagic-parent
</artifactId>
<version>
0.
2.2
-SNAPSHOT
</version>
<version>
0.
3.0
-SNAPSHOT
</version>
</parent>
</parent>
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
View file @
e06b0c1a
...
@@ -34,7 +34,7 @@ class PageModelExtractor {
...
@@ -34,7 +34,7 @@ class PageModelExtractor {
private
List
<
FieldExtractor
>
fieldExtractors
;
private
List
<
FieldExtractor
>
fieldExtractors
;
private
Extractor
e
xtractor
;
private
Extractor
objectE
xtractor
;
public
static
PageModelExtractor
create
(
Class
clazz
)
{
public
static
PageModelExtractor
create
(
Class
clazz
)
{
PageModelExtractor
pageModelExtractor
=
new
PageModelExtractor
();
PageModelExtractor
pageModelExtractor
=
new
PageModelExtractor
();
...
@@ -169,7 +169,7 @@ class PageModelExtractor {
...
@@ -169,7 +169,7 @@ class PageModelExtractor {
annotation
=
clazz
.
getAnnotation
(
ExtractBy
.
class
);
annotation
=
clazz
.
getAnnotation
(
ExtractBy
.
class
);
if
(
annotation
!=
null
)
{
if
(
annotation
!=
null
)
{
ExtractBy
extractBy
=
(
ExtractBy
)
annotation
;
ExtractBy
extractBy
=
(
ExtractBy
)
annotation
;
e
xtractor
=
new
Extractor
(
new
XpathSelector
(
extractBy
.
value
()),
Extractor
.
Source
.
Html
,
extractBy
.
notNull
(),
extractBy
.
multi
());
objectE
xtractor
=
new
Extractor
(
new
XpathSelector
(
extractBy
.
value
()),
Extractor
.
Source
.
Html
,
extractBy
.
notNull
(),
extractBy
.
multi
());
}
}
}
}
...
@@ -183,28 +183,28 @@ class PageModelExtractor {
...
@@ -183,28 +183,28 @@ class PageModelExtractor {
if
(!
matched
)
{
if
(!
matched
)
{
return
null
;
return
null
;
}
}
if
(
e
xtractor
==
null
)
{
if
(
objectE
xtractor
==
null
)
{
return
processSingle
(
page
,
page
.
getHtml
().
toString
()
);
return
processSingle
(
page
,
null
,
false
);
}
else
{
}
else
{
if
(
e
xtractor
.
multi
)
{
if
(
objectE
xtractor
.
multi
)
{
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
Object
>
os
=
new
ArrayList
<
Object
>();
List
<
String
>
list
=
e
xtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
List
<
String
>
list
=
objectE
xtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
for
(
String
s
:
list
)
{
for
(
String
s
:
list
)
{
Object
o
=
processSingle
(
page
,
s
);
Object
o
=
processSingle
(
page
,
s
,
false
);
if
(
o
!=
null
)
{
if
(
o
!=
null
)
{
os
.
add
(
o
);
os
.
add
(
o
);
}
}
}
}
return
os
;
return
os
;
}
else
{
}
else
{
String
select
=
e
xtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
String
select
=
objectE
xtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
Object
o
=
processSingle
(
page
,
select
);
Object
o
=
processSingle
(
page
,
select
,
false
);
return
o
;
return
o
;
}
}
}
}
}
}
private
Object
processSingle
(
Page
page
,
String
html
)
{
private
Object
processSingle
(
Page
page
,
String
html
,
boolean
isRaw
)
{
Object
o
=
null
;
Object
o
=
null
;
try
{
try
{
o
=
clazz
.
newInstance
();
o
=
clazz
.
newInstance
();
...
@@ -213,10 +213,14 @@ class PageModelExtractor {
...
@@ -213,10 +213,14 @@ class PageModelExtractor {
List
<
String
>
value
;
List
<
String
>
value
;
switch
(
fieldExtractor
.
getSource
())
{
switch
(
fieldExtractor
.
getSource
())
{
case
RawHtml:
case
RawHtml:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getHtml
().
toString
());
value
=
page
.
getHtml
().
selectDocumentForList
(
fieldExtractor
.
getSelector
());
break
;
break
;
case
Html:
case
Html:
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
if
(
isRaw
)
{
value
=
page
.
getHtml
().
selectDocumentForList
(
fieldExtractor
.
getSelector
());
}
else
{
value
=
fieldExtractor
.
getSelector
().
selectList
(
html
);
}
break
;
break
;
case
Url:
case
Url:
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
value
=
fieldExtractor
.
getSelector
().
selectList
(
page
.
getUrl
().
toString
());
...
@@ -232,10 +236,14 @@ class PageModelExtractor {
...
@@ -232,10 +236,14 @@ class PageModelExtractor {
String
value
;
String
value
;
switch
(
fieldExtractor
.
getSource
())
{
switch
(
fieldExtractor
.
getSource
())
{
case
RawHtml:
case
RawHtml:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getHtml
().
toString
());
value
=
page
.
getHtml
().
selectDocument
(
fieldExtractor
.
getSelector
());
break
;
break
;
case
Html:
case
Html:
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
if
(
isRaw
)
{
value
=
page
.
getHtml
().
selectDocument
(
fieldExtractor
.
getSelector
());
}
else
{
value
=
fieldExtractor
.
getSelector
().
select
(
html
);
}
break
;
break
;
case
Url:
case
Url:
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
value
=
fieldExtractor
.
getSelector
().
select
(
page
.
getUrl
().
toString
());
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
View file @
e06b0c1a
...
@@ -18,7 +18,7 @@ import java.io.PrintWriter;
...
@@ -18,7 +18,7 @@ import java.io.PrintWriter;
* Otherwise use SHA1 as file name.
* Otherwise use SHA1 as file name.
*
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.
2.2
* @since 0.
3.0
*/
*/
public
class
FilePageModelPipeline
extends
FilePersistentBase
implements
PageModelPipeline
{
public
class
FilePageModelPipeline
extends
FilePersistentBase
implements
PageModelPipeline
{
...
...
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
utils
;
package
us
.
codecraft
.
webmagic
.
utils
;
import
us.codecraft.webmagic.model.annotation.ExtractBy
;
import
us.codecraft.webmagic.model.annotation.ExtractBy
;
import
us.codecraft.webmagic.selector.CssSelector
;
import
us.codecraft.webmagic.selector.*
;
import
us.codecraft.webmagic.selector.RegexSelector
;
import
us.codecraft.webmagic.selector.Selector
;
import
us.codecraft.webmagic.selector.XpathSelector
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
/**
/**
* Tools for annotation converting. <br>
* Tools for annotation converting. <br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com <br>
* @since 0.2.1
* @since 0.2.1
*/
*/
...
@@ -27,17 +25,27 @@ public class ExtractorUtils {
...
@@ -27,17 +25,27 @@ public class ExtractorUtils {
selector
=
new
RegexSelector
(
value
);
selector
=
new
RegexSelector
(
value
);
break
;
break
;
case
XPath:
case
XPath:
selector
=
new
XpathSelector
(
value
);
selector
=
get
XpathSelector
(
value
);
break
;
break
;
default
:
default
:
selector
=
new
XpathSelector
(
value
);
selector
=
getXpathSelector
(
value
);
}
return
selector
;
}
private
static
Selector
getXpathSelector
(
String
value
)
{
Selector
selector
;
if
(
EnvironmentUtil
.
useXsoup
())
{
selector
=
new
XsoupSelector
(
value
);
}
else
{
selector
=
new
XpathSelector
(
value
);
}
}
return
selector
;
return
selector
;
}
}
public
static
List
<
Selector
>
getSelectors
(
ExtractBy
[]
extractBies
)
{
public
static
List
<
Selector
>
getSelectors
(
ExtractBy
[]
extractBies
)
{
List
<
Selector
>
selectors
=
new
ArrayList
<
Selector
>();
List
<
Selector
>
selectors
=
new
ArrayList
<
Selector
>();
if
(
extractBies
==
null
)
{
if
(
extractBies
==
null
)
{
return
selectors
;
return
selectors
;
}
}
for
(
ExtractBy
extractBy
:
extractBies
)
{
for
(
ExtractBy
extractBy
:
extractBies
)
{
...
...
webmagic-samples/pom.xml
View file @
e06b0c1a
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
<parent>
<parent>
<artifactId>
webmagic-parent
</artifactId>
<artifactId>
webmagic-parent
</artifactId>
<groupId>
us.codecraft
</groupId>
<groupId>
us.codecraft
</groupId>
<version>
0.
2.1
</version>
<version>
0.
3.0-SNAPSHOT
</version>
</parent>
</parent>
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.selector.PlainText
;
...
@@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
...
@@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
if
(
page
.
getUrl
().
toString
().
contains
(
"thread"
)){
if
(
page
.
getUrl
().
toString
().
contains
(
"thread"
)){
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//a[@id='thread_subject']"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//a[@id='thread_subject']"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@class='pcb']//tbody"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@class='pcb']//tbody
/tidyText()
"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
regex
(
"发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"
));
page
.
putField
(
"date"
,
page
.
getHtml
().
regex
(
"发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"
));
page
.
putField
(
"id"
,
new
PlainText
(
"1000"
+
page
.
getUrl
().
regex
(
"http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html"
).
toString
()));
page
.
putField
(
"id"
,
new
PlainText
(
"1000"
+
page
.
getUrl
().
regex
(
"http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html"
).
toString
()));
}
}
...
@@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor {
...
@@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor {
}
}
return
site
;
return
site
;
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
DiaoyuwengProcessor
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.List
;
import
java.util.List
;
...
@@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor {
...
@@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor {
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
strings
=
page
.
getHtml
().
regex
(
"<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}
"
).
all
();
List
<
String
>
strings
=
page
.
getHtml
().
links
().
regex
(
".*/yewu/.*
"
).
all
();
page
.
addTargetRequests
(
strings
);
page
.
addTargetRequests
(
strings
);
page
.
putField
(
"title"
,
page
.
getHtml
().
regex
(
"<title>(.*)</title>"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
regex
(
"<title>(.*)</title>"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
xpath
(
"//dd
[@class='w133']
"
));
page
.
putField
(
"body"
,
page
.
getHtml
().
xpath
(
"//dd"
));
}
}
@Override
@Override
public
Site
getSite
()
{
public
Site
getSite
()
{
return
Site
.
me
().
setDomain
(
"sh.58.com"
).
addStartUrl
(
"http://sh.58.com/"
);
//To change body of implemented methods use File | Settings | File Templates.
return
Site
.
me
().
setDomain
(
"sh.58.com"
).
addStartUrl
(
"http://sh.58.com/"
);
//To change body of implemented methods use File | Settings | File Templates.
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
F58PageProcesser
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.List
;
import
java.util.List
;
...
@@ -14,10 +15,9 @@ import java.util.List;
...
@@ -14,10 +15,9 @@ import java.util.List;
public
class
HuxiuProcessor
implements
PageProcessor
{
public
class
HuxiuProcessor
implements
PageProcessor
{
@Override
@Override
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List
<
String
>
requests
=
page
.
getHtml
().
links
().
regex
(
".*article.*"
).
all
();
List
<
String
>
requests
=
page
.
getHtml
().
regex
(
"<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}"
).
all
();
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='
neirong']//h1[@class='ph xs5']
"
));
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='
clearfix neirong']//h1/text()
"
));
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
page
.
putField
(
"content"
,
page
.
getHtml
().
smartContent
());
}
}
...
@@ -26,4 +26,8 @@ public class HuxiuProcessor implements PageProcessor {
...
@@ -26,4 +26,8 @@ public class HuxiuProcessor implements PageProcessor {
return
Site
.
me
().
setDomain
(
"www.huxiu.com"
).
addStartUrl
(
"http://www.huxiu.com/"
).
return
Site
.
me
().
setDomain
(
"www.huxiu.com"
).
addStartUrl
(
"http://www.huxiu.com/"
).
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
HuxiuProcessor
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
View file @
e06b0c1a
...
@@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils;
...
@@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.RedisScheduler
;
import
java.util.List
;
import
java.util.List
;
...
@@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor {
...
@@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
InfoQMiniBookProcessor
())
Spider
.
create
(
new
InfoQMiniBookProcessor
())
.
scheduler
(
new
RedisScheduler
(
"localhost"
))
.
pipeline
(
new
FilePipeline
(
"/data/temp/webmagic/"
))
.
thread
(
5
)
.
thread
(
5
)
.
run
();
.
run
();
}
}
...
...
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
View file @
e06b0c1a
...
@@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples;
...
@@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.FilePipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
/**
...
@@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor {
...
@@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor {
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
IteyeBlogProcessor
()).
thread
(
5
).
pipeline
(
new
FilePipeline
(
"/data/webmagic/"
)).
run
();
Spider
.
create
(
new
IteyeBlogProcessor
()).
thread
(
5
).
run
();
}
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
/**
...
@@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor {
...
@@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor {
return
Site
.
me
().
setDomain
(
"kaichiba.com"
).
addStartUrl
(
"http://kaichiba.com/shop/41725781"
).
setCharset
(
"utf-8"
).
return
Site
.
me
().
setDomain
(
"kaichiba.com"
).
addStartUrl
(
"http://kaichiba.com/shop/41725781"
).
setCharset
(
"utf-8"
).
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
KaichibaProcessor
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
View file @
e06b0c1a
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
...
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.List
;
import
java.util.List
;
...
@@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor {
...
@@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor {
}
}
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
requests
);
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"(.*/restaurant/[^#]+)"
).
all
());
page
.
addTargetRequests
(
page
.
getHtml
().
links
().
regex
(
"(.*/restaurant/[^#]+)"
).
all
());
page
.
putField
(
"items"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"
));
page
.
putField
(
"items"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]
/text()
"
));
page
.
putField
(
"prices"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"
));
page
.
putField
(
"prices"
,
page
.
getHtml
().
xpath
(
"//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]
/text()
"
));
}
}
@Override
@Override
...
@@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor {
...
@@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor {
return
Site
.
me
().
setDomain
(
"meican.com"
).
addStartUrl
(
"http://www.meican.com/shanghai/districts"
).
setCharset
(
"utf-8"
).
return
Site
.
me
().
setDomain
(
"meican.com"
).
addStartUrl
(
"http://www.meican.com/shanghai/districts"
).
setCharset
(
"utf-8"
).
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
setUserAgent
(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"
);
}
}
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
MeicanProcessor
()).
run
();
}
}
}
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
samples
;
package
us
.
codecraft
.
webmagic
.
samples
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
java.util.List
;
import
java.util.List
;
...
@@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor {
...
@@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor {
public
void
process
(
Page
page
)
{
public
void
process
(
Page
page
)
{
List
<
String
>
links
=
page
.
getHtml
().
links
().
regex
(
"http://my\\.oschina\\.net/flashsword/blog/\\d+"
).
all
();
List
<
String
>
links
=
page
.
getHtml
().
links
().
regex
(
"http://my\\.oschina\\.net/flashsword/blog/\\d+"
).
all
();
page
.
addTargetRequests
(
links
);
page
.
addTargetRequests
(
links
);
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"
).
toString
());
page
.
putField
(
"title"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1
/text()
"
).
toString
());
page
.
putField
(
"content"
,
page
.
getHtml
().
$
(
"div.content
"
).
toString
());
page
.
putField
(
"content"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogContent']/tidyText()
"
).
toString
());
page
.
putField
(
"tags"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogTags']/a/text()"
).
all
());
page
.
putField
(
"tags"
,
page
.
getHtml
().
xpath
(
"//div[@class='BlogTags']/a/text()"
).
all
());
}
}
...
@@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
...
@@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
Spider
.
create
(
new
OschinaBlogPageProcesser
()).
pipeline
(
new
ConsolePipeline
()).
run
();
Spider
.
create
(
new
OschinaBlogPageProcesser
()).
run
();
}
}
}
}
webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java
0 → 100644
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
model
;
import
org.junit.Test
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.model.samples.OschinaBlog
;
import
us.codecraft.webmagic.selector.Html
;
import
us.codecraft.webmagic.selector.PlainText
;
/**
* @author code4crafter@gmail.com
*/
public
class
ProcessorBenchmark
{
@Test
public
void
test
()
{
ModelPageProcessor
modelPageProcessor
=
ModelPageProcessor
.
create
(
Site
.
me
().
addStartUrl
(
"http://my.oschina.net/flashsword/blog"
),
OschinaBlog
.
class
);
Page
page
=
new
Page
();
page
.
setRequest
(
new
Request
(
"http://my.oschina.net/flashsword/blog"
));
page
.
setUrl
(
new
PlainText
(
"http://my.oschina.net/flashsword/blog"
));
page
.
setHtml
(
new
Html
(
html
));
long
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
modelPageProcessor
.
process
(
page
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()
-
time
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
modelPageProcessor
.
process
(
page
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()
-
time
);
}
private
String
html
=
"\n"
+
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
+
"<html lang='zh-CN' xml:lang='zh-CN' xmlns='http://www.w3.org/1999/xhtml'>\n"
+
"<head>\n"
+
" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>\n"
+
" <meta http-equiv=\"Content-Language\" content=\"zh-CN\"/>\n"
+
" <meta name=\"robots\" content=\"index, follow\" />\n"
+
" <link rel=\"shortcut icon\" type=\"image/x-icon\" href=\"/img/favicon.ico\" />\n"
+
" <title>Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区</title>\n"
+
" <meta name=\"Keywords\" content=\"Jsoup,XSS,OO\"/>\n"
+
" <meta name=\"Description\" content=\"Jsoup代码解读之八-防御XSS攻击:![hacker][1] ## 防御XSS攻击的一般原理 cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的...\"/>\n"
+
" <link rel=\"stylesheet/less\" href=\"http://my.oschina.net/flashsword/styles.less?ver=20130608&date=20130524070359\" type=\"text/css\" media=\"screen\" />\n"
+
" <link rel=\"stylesheet\" href=\"/js/2012/poshytip/tip-yellowsimple/tip-yellowsimple.css\" type=\"text/css\" />\n"
+
" <link rel=\"stylesheet\" type=\"text/css\" href=\"/js/2011/fancybox/jquery.fancybox-1.3.4.css\" media=\"screen\" />\n"
+
" <script type=\"text/javascript\" src=\"/js/2012/jquery-1.7.1.min.js\"></script>\n"
+
" <script type=\"text/javascript\" src=\"/js/2012/jquery.form.js\"></script>\n"
+
" <script type=\"text/javascript\" src=\"/js/2011/fancybox/jquery.fancybox-1.3.4.pack.js\"></script>\n"
+
" <script type=\"text/javascript\" src=\"/js/2012/poshytip/jquery.poshytip.min.js\"></script>\n"
+
" <script type=\"text/javascript\" src=\"/js/2011/oschina.js?ver=20121007\"></script>\n"
+
" <script type=\"text/javascript\" src=\"/js/2012/less-1.3.0.min.js\"></script>\n"
+
" <script type=\"text/javascript\" src=\"/js/scrolltopcontrol.js\"></script>\n"
+
" <script type='text/javascript' src='/js/jquery/jquery.atwho.js'></script>\n"
+
" <link rel=\"stylesheet\" type=\"text/css\" href=\"/js/jquery/jquery.atwho.css\" />\n"
+
" <link rel=\"alternate\" type=\"application/rss+xml\" title=\"黄亿华最新博客\" href=\"http://my.oschina.net/flashsword/rss\" />\n"
+
" <link rel=\"EditURI\" type=\"application/rsd+xml\" title=\"RSD\" href=\"http://my.oschina.net/action/xmlrpc/rsd?space=190591\" />\n"
+
" <link rel=\"wlwmanifest\" type=\"application/wlwmanifest+xml\" href=\"http://my.oschina.net/action/xmlrpc/wlwmanifest?space=190591\" /> \n"
+
" <style type=\"text/css\">\n"
+
" body,table,input,textarea,select {font-family:Verdana,sans-serif,宋体;}\t\n"
+
" </style>\n"
+
" <script type=\"text/javascript\">\n"
+
" \tscrolltotop.offset(100,165);\n"
+
"\tscrolltotop.init();\n"
+
" </script>\n"
+
"</head>\n"
+
"<body>\n"
+
"<div id=\"OSC_Screen\">\n"
+
"\t<div id='OSC_Banner'>\n"
+
"\t\t<div id=\"OSC_Logo\">\n"
+
" \t<a href=\"http://www.oschina.net/\" title=\"开源中国社区首页\">开源中国社区</a>\n"
+
" </div>\n"
+
" <div id='OSC_Slogon'>开源项目发现、使用和交流平台</div>\n"
+
"\t\t <div id=\"OSC_Channels\">\n"
+
" \t<ul>\n"
+
" \t<li><a href=\"http://www.oschina.net/project\" class='software'>项目</a></li>\n"
+
" \t<li><a href=\"http://www.oschina.net/question\" class='question'>讨论</a></li>\n"
+
" \t<li><a href=\"http://www.oschina.net/code/list\" class='code'>代码</a></li>\n"
+
" \t<li><a href=\"http://www.oschina.net/news\" class='news'>资讯</a></li>\n"
+
" \t<li><a href=\"http://www.oschina.net/translate\" class='translate'>翻译</a></li>\n"
+
" \t<li><a href=\"http://www.oschina.net/blog\" class='blog'>博客</a></li>\n"
+
" \t<li><a href=\"http://www.oschina.net/android\" class='android'>Android</a></li>\n"
+
" \t<li><a href=\"http://www.oschina.net/job\" class='job'>招聘</a></li>\n"
+
" \t</ul>\n"
+
" </div>\n"
+
" <div class='clear'></div>\n"
+
"\t</div>\n"
+
"\t<div id=\"OSC_Topbar\">\n"
+
"\t\t<div id=\"VisitorInfo\">\n"
+
"\t\t当前访客身份:\n"
+
"\t\t\t\t黄亿华 [ <a href=\"/action/user/logout?session=6db40e6e2d1061998068&goto_page=http%3A%2F%2Fmy.oschina.net%2Fflashsword\">退出</a> ]\n"
+
"\t\t\t\t<span id=\"OSC_Notification\">\t\t\t\n"
+
"\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<a href=\"http://my.oschina.net/flashsword/admin/inbox\" class=\"msgbox\" title=\"进入我的留言箱\">你有<em>0</em>新留言</a>\t\t\t\n"
+
"\t\t\t\t\t\t\t\t\t\t\t</span>\n"
+
"\t\t</div>\n"
+
"\t\t<div id=\"SearchBar\">\n"
+
" \t\t<form action=\"http://www.oschina.net/search\">\n"
+
"\t\t\t\t<input type='hidden' name='user' value='190591'/>\n"
+
"\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<span class=\"ipt f_l\">\n"
+
" \t\t\t<input type='text' id='txt_q' name='q' class='SERACH' value='在 26755 款开源软件中搜索' onblur=\"(this.value=='')?this.value='在 26755 款开源软件中搜索':this.value\" onfocus=\"if(this.value=='在 26755 款开源软件中搜索'){this.value='';};this.select();\"/>\n"
+
"\t\t\t\t</span>\n"
+
"\t\t\t\t <div class=\"search-by selectbox\">\n"
+
" \t\t\t\t<span class=\"hide\">\n"
+
" \t\t\t\t<select name='scope'>\t\t\t\t\t\n"
+
" <option value='project' selected>软件</option>\n"
+
" <option value='code'>代码</option>\n"
+
" <option value='bbs'>讨论区</option>\n"
+
" <option value='news'>新闻</option>\n"
+
" <option value='blog'>博客</option>\n"
+
" \t\t\t\t</select>\n"
+
" \t\t\t\t</span>\n"
+
" <div class=\"search_on\" id=\"search-item\"><span class=\"text\">软件</span></div>\n"
+
" <ul class=\"search_list\">\n"
+
" <li class=\"search-item\"><a href=\"#1\">软件</a></li>\n"
+
" <li><a href=\"#2\">代码</a></li>\n"
+
" <li><a href=\"#3\">讨论区</a></li>\n"
+
" <li><a href=\"#4\">新闻</a></li>\n"
+
" <li><a href=\"#5\">博客</a></li>\n"
+
" </ul>\n"
+
" </div>\n"
+
"\t\t\t\t<input type='submit' value='搜索' class='bnt f_r'/>\t\t\t\n"
+
" \t\t</form>\n"
+
"\t\t</div>\n"
+
"\t\t<div class='clear'></div>\n"
+
"\t</div>\n"
+
"\t<div id=\"OSC_Content\">\t\n"
+
"\n"
+
"<div id='SpaceLeft'>\n"
+
"<div class='Owner'>\n"
+
"\t\t<a href='http://my.oschina.net/flashsword/admin/user-settings?tab=3' title='切换空间风格' class='ThemeSetting'>切换风格</a> <a href=\"http://my.oschina.net/flashsword\" class='Img'><img src=\"http://static.oschina.net/uploads/user/95/190591_100.jpg?t=1347254905000\" align=\"absmiddle\" alt=\"黄亿华\" title=\"黄亿华\" class=\"LargePortrait\"/></a>\n"
+
" <span class='U'>\n"
+
" <a href=\"http://my.oschina.net/flashsword\" class='Name' title='男'>黄亿华</a>\n"
+
"\t\t<span class='opts'>\n"
+
"\t\t\t<img src=\"/img/2012/men.png\" align='absmiddle' title='男'/>\n"
+
" \t\t\t<a href=\"http://my.oschina.net/flashsword/admin/profile\">修改资料</a>\n"
+
"\t\t\t<a href=\"http://my.oschina.net/flashsword/admin/portrait\">更换头像</a>\n"
+
" \t\t</span>\n"
+
" </span>\n"
+
" <div class='clear'></div>\n"
+
" <div class='stat'>\n"
+
" \t<a href=\"http://my.oschina.net/flashsword/fellow\">关注(43)</a>\n"
+
" \t<a href=\"http://my.oschina.net/flashsword/fans\">粉丝(98)</a>\n"
+
" \t<a href=\"http://www.oschina.net/question/3307_20931\" title=\"查看OSCHINA积分规则\">积分(173)</a>\n"
+
" </div>\n"
+
"</div><style>\n"
+
"#MyResume textarea {width:170px;height:60px;font-size:9pt;}\n"
+
"</style>\n"
+
"<div class='Resume' id='MyResume'>\n"
+
"码农一枚<br/>实用主义者<br/>抵制重复造轮子,却造了不少轮子<br/>http://codecraft.us</div>\n"
+
"<script type=\"text/javascript\" src=\"/js/2012/jquery.editinplace.js\"></script>\n"
+
"<script type=\"text/javascript\">\n"
+
"$(\"#MyResume\").editInPlace({\n"
+
" url: \"/action/profile/update_user_signature?user_code=tzm9Wg2YoU8SkJaTIjHQkahStiXQNyymUGXFOQgN\",\n"
+
"\tbg_over: \"none\",\n"
+
"\tbg_out: \"none\",\n"
+
" field_type: \"textarea\",\n"
+
"\tvalue_required: \"true\",\n"
+
"\terror: function(){\n"
+
"\t\talert(\"修改个人简介失败\");\n"
+
"\t}\n"
+
"});\n"
+
"</script>\n"
+
"\n"
+
"<div class='Opts clearfix'>\n"
+
"\t<a href=\"http://my.oschina.net/flashsword/admin/new-blog\" class='a1 blog'><i>.</i><span>发表博文</span></a>\n"
+
"\t<a href=\"http://my.oschina.net/flashsword/admin\" class='a2 admin'><i>.</i><span>空间管理</span></a>\n"
+
"</div><div class=\"Mod\" id=\"BlogCatalogs\">\n"
+
" <strong><a href=\"http://my.oschina.net/flashsword/admin/blog-catalogs\" class=\"more\">管理»</a> 博客分类</strong>\n"
+
" <ul>\n"
+
"\t\t\t<li class='draft'><a href=\"http://my.oschina.net/flashsword/admin/drafts\">草稿箱</a><span>(4)</span></li>\n"
+
"\t \t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=371362\">webmagic</a><span>(16)</span></li>\n"
+
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=380473\">分布式消息系统</a><span>(5)</span></li>\n"
+
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=285504\">探耽求究</a><span>(5)</span></li>\n"
+
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=368513\">BlackHoleJ</a><span>(21)</span></li>\n"
+
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=368514\">Intellij</a><span>(4)</span></li>\n"
+
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=112331\">工作日志</a><span>(7)</span></li>\n"
+
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=112332\">日常记录</a><span>(4)</span></li>\n"
+
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=261044\">codecraft</a><span>(1)</span></li>\n"
+
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=279271\">开发日记</a><span>(3)</span></li>\n"
+
"\t </ul>\n"
+
"</div><div class=\"Mod\" id=\"HotBlogs\">\n"
+
" <strong>阅读排行</strong>\n"
+
" <ol>\n"
+
"\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/145796\">1. webmagic的设计机制及原理-如何开发一个Java爬虫</a></li>\n"
+
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/143028\">2. monkeysocks开发日志--TCP协议分析及架构规划</a></li>\n"
+
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/156638\">3. 【整理】国内一些大公司的开源项目</a></li>\n"
+
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/110276\">4. BlackHole开发日志--防止DNS污染</a></li>\n"
+
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/158200\">5. Jsoup代码解读之八-防御XSS攻击</a></li>\n"
+
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/123505\">6. IntelliJ IDEA使用心得</a></li>\n"
+
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/80037\">7. 关于HTTP keep-alive的实验</a></li>\n"
+
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/152263\">8. 分布式消息系统研究报告之Kafka</a></li>\n"
+
"\t\t </ol>\n"
+
"</div>\n"
+
"<div class=\"Mod\" id=\"BlogReplies\">\n"
+
" <strong><a href=\"http://my.oschina.net/flashsword/admin/blog-comments\" class=\"more\">管理»</a> 最新评论</strong> \n"
+
" <ul>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“lidongyang”的评论 引用来自“黄亿华...\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275640366&type=18&user=190591\">查看»</a>\n"
+
"\t</li>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/lidongyang\">@lidongyang</a>:引用来自“黄亿华”的评论 引用来自“lidongyan...\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275640301&type=18&user=723383\">查看»</a>\n"
+
"\t</li>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“lidongyang”的评论 引用来自“黄亿华...\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275638563&type=18&user=190591\">查看»</a>\n"
+
"\t</li>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/lidongyang\">@lidongyang</a>:引用来自“黄亿华”的评论 引用来自“lidongyan...\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275638070&type=18&user=723383\">查看»</a>\n"
+
"\t</li>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“searchjack”的评论 不是好的就会被认...\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275617319&type=18&user=190591\">查看»</a>\n"
+
"\t</li>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/searchjack\">@searchjack</a>:不是好的就会被认可, 干自己的, 到时候, 单干\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275617235&type=18&user=234880\">查看»</a>\n"
+
"\t</li>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/searchjack\">@searchjack</a>:极好的工具,\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275616963&type=18&user=234880\">查看»</a>\n"
+
"\t</li>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275599170&type=18&user=190591\">查看»</a>\n"
+
"\t</li>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/rox\">@静风流云</a>:貌似,OSC也是类似处理的。\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275599137&type=18&user=180\">查看»</a>\n"
+
"\t</li>\n"
+
"\t\t<li>\n"
+
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n"
+
"\t\t<a href=\"/action/tweet/go?obj=275570030&type=18&user=190591\">查看»</a>\n"
+
"\t</li>\n"
+
"\t </ul>\n"
+
" </div>\n"
+
"<div class='Mod' id='Stat'>\n"
+
"<strong>访客统计</strong>\n"
+
"<ul>\n"
+
"\t<li><label>今日访问:</label>6 (<a href=\"http://my.oschina.net/flashsword/visitors\">查看最新访客»</a>)</li>\n"
+
" <li><label>昨日访问:</label>284</li>\n"
+
" <li><label>本周访问:</label>817</li>\n"
+
" <li><label>本月访问:</label>1888</li>\n"
+
" <li><label>所有访问:</label>16453</li>\n"
+
"</ul>\n"
+
"</div></div>\n"
+
"\n"
+
"<div class='SpaceList'>\n"
+
"\t<div class='TopBar'>\n"
+
" \t<div class='NavPath'>\t\t\n"
+
" \t\t<a href='http://my.oschina.net/flashsword'>空间</a> » <a href='http://my.oschina.net/flashsword/blog'>博客</a>\t\t\t\n"
+
"\t\t\t» <a href=\"http://my.oschina.net/flashsword/blog?catalog=371362\">webmagic</a>\n"
+
"\t\t\t» 博客正文\n"
+
" \t</div>\n"
+
"\t</div>\n"
+
"\t\n"
+
" \t<div class='BlogEntity'>\t\t\n"
+
" <div class='BlogTitle'>\n"
+
" <h1><img src='/img/space/b1.gif' align='absmiddle'/> Jsoup代码解读之八-防御XSS攻击</h1>\n"
+
" <div class='BlogStat'>\n"
+
" \t\t \t\t \t\t<span class='admin'>\n"
+
" \t\t\t<a href=\"http://my.oschina.net/flashsword/admin/edit-blog?blog=158200\">编辑</a> | <a href=\"javascript:delete_blog(158200)\">删除</a>\n"
+
" \t\t</span>\n"
+
"\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n"
+
" \t\t已有<strong>1628</strong>次阅读 ,共<strong><a href=\"#comments\">3</a></strong>个评论\n"
+
" \t\t\t\t\t,共 <strong>79</strong> 人收藏此文 \t</div> \n"
+
" </div>\n"
+
"\t \t <div class=\"BlogAnchor\">\n"
+
" <p>目录:[ <strong><a href=\"#\" id=\"AnchorContentToggle\" title=\"收起\">-</a></strong> ]</p>\n"
+
" <div class=\"AnchorContent\" id=\"AnchorContent\"><li class='osc_h2'><a href='#OSC_h2_1'>防御XSS攻击的一般原理</a></li><li class='osc_h2'><a href='#OSC_h2_2'>Cleaner与Whitelist</a></li><li class='osc_h2'><a href='#OSC_h2_3'>结束语</a></li></div>\n"
+
" \t </div>\n"
+
" <script>\n"
+
"\t\t \t$(function(){\n"
+
"\t\t\t\t$(\"#AnchorContentToggle\").click(function(){\n"
+
"\t\t\t\t\tvar text = $(this).html();\n"
+
"\t\t\t\t\tif(text==\"-\"){\n"
+
"\t\t\t\t\t\t$(this).html(\"+\");\n"
+
"\t\t\t\t\t\t$(this).attr({\"title\":\"展开\"});\n"
+
"\t\t\t\t\t}else{\n"
+
"\t\t\t\t\t\t$(this).html(\"-\");\n"
+
"\t\t\t\t\t\t$(this).attr({\"title\":\"收起\"});\n"
+
"\t\t\t\t\t}\n"
+
"\t\t\t\t\t$(\"#AnchorContent\").toggle();\n"
+
"\t\t\t\t});\n"
+
"\t\t\t});\n"
+
"\t\t </script>\n"
+
"\t \t <div class='BlogContent'><p><img src=\"http://static.oschina.net/uploads/space/2013/0831/071752_RBZc_190591.png\" /></p> \n"
+
"<span id=\"OSC_h2_1\"></span>\n"
+
"<h2>防御XSS攻击的一般原理</h2> \n"
+
"<p>cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。</p> \n"
+
"<p>我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<code><</code>,<code>></code>,<code>"</code>,<code>'</code>是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。</p> \n"
+
"<p>在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:</p> \n"
+
"<ol> \n"
+
" <li><p>将HTML解析为DOM树</p> <p>这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用<code></textarea></code>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。</p></li> \n"
+
" <li><p>过滤高风险标签/属性/属性值</p> <p>高风险标签是指<code><script></code>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如<code>onclick='alert("xss!")'</code>。</p></li> \n"
+
" <li><p>重新将DOM树输出为HTML文本</p> <p>DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。</p></li> \n"
+
"</ol> \n"
+
"<span id=\"OSC_h2_2\"></span>\n"
+
"<h2>Cleaner与Whitelist</h2> \n"
+
"<p>对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。</p> \n"
+
"<p>Jsoup给出的答案是白名单。下面是<code>Whitelist</code>的部分代码。</p> \n"
+
"<pre class=\"brush: java\">public class Whitelist {\n"
+
" private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n"
+
" private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n"
+
" private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n"
+
" private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n"
+
" private boolean preserveRelativeLinks; // option to preserve relative links\n"
+
"}</pre> \n"
+
"<p>这里定义了标签名/属性名/属性值的白名单。</p> \n"
+
"<p>而<code>Cleaner</code>是过滤的执行者。不出所料,Cleaner内部定义了<code>CleaningVisitor</code>来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到<code>Element destination</code>里去。</p> \n"
+
"<pre class=\"brush: java\">private final class CleaningVisitor implements NodeVisitor {\n"
+
" private int numDiscarded = 0;\n"
+
" private final Element root;\n"
+
" private Element destination; // current element to append nodes to\n"
+
"\n"
+
" private CleaningVisitor(Element root, Element destination) {\n"
+
" this.root = root;\n"
+
" this.destination = destination;\n"
+
" }\n"
+
"\n"
+
" public void head(Node source, int depth) {\n"
+
" if (source instanceof Element) {\n"
+
" Element sourceEl = (Element) source;\n"
+
"\n"
+
" if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n"
+
" ElementMeta meta = createSafeElement(sourceEl);\n"
+
" Element destChild = meta.el;\n"
+
" destination.appendChild(destChild);\n"
+
"\n"
+
" numDiscarded += meta.numAttribsDiscarded;\n"
+
" destination = destChild;\n"
+
" } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n"
+
" numDiscarded++;\n"
+
" }\n"
+
" } else if (source instanceof TextNode) {\n"
+
" TextNode sourceText = (TextNode) source;\n"
+
" TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n"
+
" destination.appendChild(destText);\n"
+
" } else { // else, we don't care about comments, xml proc instructions, etc\n"
+
" numDiscarded++;\n"
+
" }\n"
+
" }\n"
+
"\n"
+
" public void tail(Node source, int depth) {\n"
+
" if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n"
+
" destination = destination.parent(); // would have descended, so pop destination stack\n"
+
" }\n"
+
" }\n"
+
"}</pre> \n"
+
"<span id=\"OSC_h2_3\"></span>\n"
+
"<h2>结束语</h2> \n"
+
"<p>至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:</p> \n"
+
"<ul> \n"
+
" <li><p>最好的代码抽象,是对现实概念的映射。</p> <p>这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。</p></li> \n"
+
" <li><p>不要过度抽象</p> <p>在Jsoup里,只用到了两个接口,一个是<code>NodeVisitor</code>,一个是<code>Connection</code>,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。</p> <p>另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。</p></li> \n"
+
"</ul> \n"
+
"<p>最后继续贴上Jsoup解读系列的github地址:<a href=\"https://github.com/code4craft/jsoup-learning/\" rel=\"nofollow\">https://github.com/code4craft/jsoup-learning/</a></p></div>\n"
+
" \t \t \n"
+
" \t\n"
+
"\t <div class='BlogTags'>\n"
+
" \t<strong>关键字:</strong>\n"
+
" \t \t<a href=\"http://www.oschina.net/search?scope=blog&q=Jsoup\" class=\"tag\">Jsoup</a>\n"
+
" \t \t<a href=\"http://www.oschina.net/search?scope=blog&q=XSS\" class=\"tag\">XSS</a>\n"
+
" \t \t<a href=\"http://www.oschina.net/search?scope=blog&q=OO\" class=\"tag\">OO</a>\n"
+
" \t \t </div>\n"
+
"\t \t \n"
+
" <div class='BlogCopyright'>\t\t\n"
+
"\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n"
+
"\t \t </div>\n"
+
"\n"
+
" <div class='BlogLinks'>\n"
+
" \t<ul>\n"
+
" <li class='prev'><a href=\"http://my.oschina.net/flashsword/blog/158171\" title=\"上一篇:Jsoup代码解读之七-实现一个CSS Selector\">« Jsoup代码解读之七-实现一个CSS Selector</a></li> \t</ul>\n"
+
"\t\t </div>\n"
+
"\t</div>\n"
+
"\n"
+
"\t<style type='text/css'>\n"
+
"\t#BlogShare strong{float:left;padding-top:10px;font-size:11pt;color:#444;}\n"
+
"\t#BlogShare a.share_sina{float:left;width:32px;height:32px;background:url('/img/icon01.gif') center no-repeat;}\n"
+
"\t#BlogShare a.share_qq{float:left;width:32px;height:32px;margin-left: 10px;background:url('/img/icon02.gif') center no-repeat;}\n"
+
"\t</style>\n"
+
"\t<div class='BlogShare'>\n"
+
"\t\n"
+
"\t<span id='BlogShare'>\n"
+
"\t\t<strong>分享到: </strong>\n"
+
"\t\t<a class=\"share_sina\" title=\"分享到新浪微博\" href=\"javascript:void((function(s,d,e,r,l,p,t,z,c){var%20f='http://v.t.sina.com.cn/share/share.php?appkey=858381728',u=z||d.location,p=['&url=',e(u),'&title=',e(t||d.title),'&source=',e(r),'&sourceUrl=',e(l),'&content=',c||'gb2312','&pic=',e(p||'')].join('');function%20a(){if(!window.open([f,p].join(''),'mb',['toolbar=0,status=0,resizable=1,width=440,height=430,left=',(s.width-440)/2,',top=',(s.height-430)/2].join('')))u.href=[f,p].join('');};if(/Firefox/.test(navigator.userAgent))setTimeout(a,0);else%20a();})(screen,document,encodeURIComponent,'','','','Jsoup代码解读之八-防御XSS攻击: 防御XSS攻击的一般原理 cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。 我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,...','','utf-8'));\"></a>\n"
+
"\t\t<a class=\"share_qq\" title=\"分享到腾讯微博\" href=\"javascript:(function(){window.open('http://v.t.qq.com/share/share.php?url='+encodeURIComponent(document.location)+'&appkey=96f54f97c4de46e393c4835a266207f4&site=&title='+encodeURIComponent(document.title)+encodeURIComponent(': 防御XSS攻击的一般原理 cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。 我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,...'),'', 'width=450, height=400, top=0, left=0, toolbar=no, menubar=no, scrollbars=no, location=yes, resizable=no, status=no');}())\"></a></span>\n"
+
"\t<span id='BlogVote'>\n"
+
" <a href=\"javascript:vote(158200)\">顶</a><span>已有 <em id='vote_count'>0</em>人顶</span>\n"
+
"\t</span>\n"
+
"\t</div>\n"
+
"\t\t\n"
+
"</div>\n"
+
"<div class='SpaceList' style='margin-top:20px;'>\n"
+
"<div class='BlogComments'>\n"
+
" <h2><a name=\"comments\"></a>共有 3 条网友评论</h2>\n"
+
"\t\t\t<ul id=\"BlogComments\">\n"
+
"\t\t\t\t\t\t<li id='cmt_158200_180_275599137'>\n"
+
"\t<table class='ostable'><tr>\n"
+
"\t<td class='portrait'>\n"
+
"\t\t<a href=\"http://my.oschina.net/rox\" target=\"_blank\"><img src=\"http://static.oschina.net/uploads/user/0/180_50.jpg?t=1367919013000\" align=\"absmiddle\" alt=\"静风流云\" title=\"静风流云\" class=\"SmallPortrait\" user=\"180\"/></a>\t\t\t\n"
+
"\t</td>\n"
+
"\t<td class='body'>\n"
+
"\t\t<div class='title'>\n"
+
"\t\t\t1楼:<a href=\"http://my.oschina.net/rox\" target=\"_blank\" name=\"rpl_275599137\">静风流云</a> 发表于 2013-09-01 08:34 \t\t\t\n"
+
" \t \t <a href=\"javascript:delete_c(158200,180,275599137)\">删除</a>\n"
+
"\t\t\t\t\t\t\t\t\t <a href=\"javascript:ReplyInline(158200,180,275599137)\">回复此评论</a>\n"
+
"\t\t\t\t\t</div>\n"
+
"\t\t<div class='post'\">貌似,OSC也是类似处理的。</div>\n"
+
"\t\t<div id='inline_reply_of_158200_180_275599137' class='inline_reply'></div>\n"
+
" </td>\n"
+
"\t</tr></table>\n"
+
"</li>\t\t\t\t\t<li id='cmt_158200_190591_275599170'>\n"
+
"\t<table class='ostable'><tr>\n"
+
"\t<td class='portrait'>\n"
+
"\t\t<a href=\"http://my.oschina.net/flashsword\" target=\"_blank\"><img src=\"http://static.oschina.net/uploads/user/95/190591_50.jpg?t=1347254905000\" align=\"absmiddle\" alt=\"黄亿华\" title=\"黄亿华\" class=\"SmallPortrait\" user=\"190591\"/></a>\t\t\t\n"
+
"\t</td>\n"
+
"\t<td class='body'>\n"
+
"\t\t<div class='title'>\n"
+
"\t\t\t2楼:<a href=\"http://my.oschina.net/flashsword\" target=\"_blank\" name=\"rpl_275599170\">黄亿华</a> 发表于 2013-09-01 08:37 \t\t\t\n"
+
" \t \t <a href=\"javascript:delete_c(158200,190591,275599170)\">删除</a>\n"
+
"\t\t\t\t\t\t\t\t</div>\n"
+
"\t\t<div class='post'\"><div class=ref><h4>引用来自“静风流云”的评论</h4><p>貌似,OSC也是类似处理的。</p></div>OSC就是使用Jsoup做解析的,见这里:<a href='http://www.oschina.net/p/jsoup' rel='nofollow' target='_blank'>http://www.oschina.net/p/jsoup</a></div>\n"
+
"\t\t<div id='inline_reply_of_158200_190591_275599170' class='inline_reply'></div>\n"
+
" </td>\n"
+
"\t</tr></table>\n"
+
"</li>\t\t\t\t\t<li id='cmt_158200_234880_275616963'>\n"
+
"\t<table class='ostable'><tr>\n"
+
"\t<td class='portrait'>\n"
+
"\t\t<a href=\"http://my.oschina.net/searchjack\" target=\"_blank\"><img src=\"http://static.oschina.net/uploads/user/117/234880_50.jpg?t=1362718646000\" align=\"absmiddle\" alt=\"searchjack\" title=\"searchjack\" class=\"SmallPortrait\" user=\"234880\"/></a>\t\t\t\n"
+
"\t</td>\n"
+
"\t<td class='body'>\n"
+
"\t\t<div class='title'>\n"
+
"\t\t\t3楼:<a href=\"http://my.oschina.net/searchjack\" target=\"_blank\" name=\"rpl_275616963\">searchjack</a> 发表于 2013-09-02 09:20 \t\t\t\n"
+
" \t \t <a href=\"javascript:delete_c(158200,234880,275616963)\">删除</a>\n"
+
"\t\t\t\t\t\t\t\t\t <a href=\"javascript:ReplyInline(158200,234880,275616963)\">回复此评论</a>\n"
+
"\t\t\t\t\t</div>\n"
+
"\t\t<div class='post'\">极好的工具,</div>\n"
+
"\t\t<div id='inline_reply_of_158200_234880_275616963' class='inline_reply'></div>\n"
+
" </td>\n"
+
"\t</tr></table>\n"
+
"</li>\t\t\t\t</ul>\n"
+
"</div>\n"
+
"\t</div>\n"
+
"\n"
+
"<div id='inline_reply_editor' style='display:none;'>\n"
+
"<div class=\"BlogCommentForm\">\n"
+
"\t<form id=\"form_inline_comment\" action=\"/action/blog/add_comment?blog=158200\" method=\"POST\">\n"
+
"\t <input type='hidden' id='inline_reply_id' name='reply_id' value=''/> \n"
+
" <textarea name=\"content\" style=\"width:550px;height:60px;\" onkeydown=\"if((event.metaKey || event.ctrlKey)&&event.keyCode==13){$('#form_inline_comment').submit();}\"></textarea><br/>\n"
+
"\t <input type=\"submit\" value=\"回复\" id=\"btn_comment\" class=\"SUBMIT\"/> \n"
+
"\t <input type=\"button\" value=\"关闭\" class=\"SUBMIT\" id='btn_close_inline_reply'/> 文明上网,理性发言\n"
+
" </form>\n"
+
"</div>\n"
+
"</div>\n"
+
"<div class='SpaceList' style='margin-top:20px;'>\n"
+
" <a name=\"comments\" id=\"postform\"></a>\n"
+
" <div class=\"BlogCommentForm\">\n"
+
" <form id=\"form_comment\" action=\"/action/blog/add_comment?blog=158200\" method=\"POST\"> \n"
+
" <textarea id='ta_post_content' name=\"content\" style=\"width:550px;height:100px;\" onkeydown=\"if((event.metaKey || event.ctrlKey)&&event.keyCode==13){$('#form_comment').submit();}\"></textarea><br/>\n"
+
"\t <input type=\"submit\" value=\"发表评论\" id=\"btn_comment\" class=\"SUBMIT\" /> \n"
+
"\t <img id=\"submiting\" style=\"display:none\" src=\"/img/loading.gif\" align=\"absmiddle\"/>\n"
+
"\t <span id='cmt_tip'>文明上网,理性发言</span>\n"
+
" </form>\n"
+
"\t<a href=\"#\" class=\"more\">回到页首</a> | <a href=\"#comments\" class=\"more\">回到评论列表</a>\n"
+
" </div>\n"
+
" </div>\n"
+
"\t\n"
+
"<div id=\"RelativeBlogs\">\n"
+
"\t<strong><a id='btn_close'>关闭</a>相关文章阅读</strong>\n"
+
"\t<ul>\n"
+
"\t\t\t<li>\n"
+
"\t\t<span class='date'>2012/04/04</span>\n"
+
"\t\t<a href=\"http://my.oschina.net/soitravel/blog/52366\" title=\"oo原则\">oo原则</a>\n"
+
"\t</li>\n"
+
"\t\t\t\t<li>\n"
+
"\t\t<span class='date'>2012/09/03</span>\n"
+
"\t\t<a href=\"http://my.oschina.net/wangfree/blog/76273\" title=\"XSS跨站脚本攻击\">XSS跨站脚本攻击</a>\n"
+
"\t</li>\n"
+
"\t\t\t\t<li>\n"
+
"\t\t<span class='date'>2012/10/10</span>\n"
+
"\t\t<a href=\"http://my.oschina.net/samshuai/blog/82382\" title=\"《蟋蟀的xss淫荡教程之如何劫持OSC用户账号》\">《蟋蟀的xss淫荡教程之如何劫持OSC...</a>\n"
+
"\t</li>\n"
+
"\t\t\t\t<li>\n"
+
"\t\t<span class='date'>2013/06/08</span>\n"
+
"\t\t<a href=\"http://my.oschina.net/tdoly/blog/136632\" title=\"[Security]XSS一直是个棘手的问题\">[Security]XSS一直是个棘手的问题...</a>\n"
+
"\t</li>\n"
+
"\t\t\t\t<li>\n"
+
"\t\t<span class='date'>2013/01/05</span>\n"
+
"\t\t<a href=\"http://my.oschina.net/sharephper/blog/100107\" title=\"xss攻击\">xss攻击</a>\n"
+
"\t</li>\n"
+
"\t\t\t</ul>\n"
+
"</div>\n"
+
"<script type=\"text/javascript\" src=\"/action/visit/blog?id=158200\" defer=\"defer\"></script>\n"
+
"<script type=\"text/javascript\" src=\"/js/syntax-highlighter-2.1.382/scripts/brush.js\"></script>\n"
+
"<link type=\"text/css\" rel=\"stylesheet\" href=\"/js/syntax-highlighter-2.1.382/styles/shCore.css\"/>\n"
+
"<link type=\"text/css\" rel=\"stylesheet\" href=\"/js/syntax-highlighter-2.1.382/styles/shThemeDefault.css\"/>\n"
+
"<script type='text/javascript'><!--\n"
+
"$(document).ready(function(){\n"
+
"\tSyntaxHighlighter.config.clipboardSwf = '/js/syntax-highlighter-2.1.382/scripts/clipboard.swf';\n"
+
"\tSyntaxHighlighter.all();\n"
+
"});\n"
+
"//-->\n"
+
"</script>\n"
+
"<!--[if lt IE 7]>\n"
+
"<script type=\"text/javascript\" src=\"/js/minmax.js\"></script>\n"
+
"<![endif]-->\n"
+
"<script type='text/javascript'>\n"
+
"<!--\n"
+
"var posting = false;\n"
+
"var upprev_closed = false;\n"
+
"var upprev_hidden = true;\n"
+
"\n"
+
"$(document).ready(function(){\n"
+
" $('.BlogContent img').css('cursor','pointer');\n"
+
" jQuery.each($('.BlogContent img'),function(idx,v){\n"
+
" \t$(v).wrap(\"<a href='\"+$(this).attr('src')+\"' target='_blank'></a>\");\n"
+
" });\n"
+
"\t$('#form_comment').ajaxForm({\n"
+
"\t\tdataType: 'json',\n"
+
"\t\tbforeSubmit: function(){\n"
+
"\t\t\tposting = true;\n"
+
"\t\t},\n"
+
"\t\tsuccess: function(json) {\n"
+
" \tif(json.msg){\n"
+
"\t\t\t\t///alert(json.msg);\n"
+
"\t\t\t\t$('#cmt_tip').html(\"<span style='color:#C00;'>\"+json.msg+\"</span>\");\n"
+
"\t\t\t\t$('#ta_post_content').focus();\t\t\t\t\n"
+
"\t\t\t}else{\n"
+
"\t\t\t\tvar url = \"http://my.oschina.net/flashsword/blog_post?_cmt_blog=\"+json.blog+\"&_cmt_user=\"+json.user+\"&_cmt_id=\"+json.id;\t\t\t\t\n"
+
" \t\tjQuery.get(url, function(data){\n"
+
" \t\t\t\t$('.BlogComments .NoData').hide();\n"
+
" \t\t\t$('ul#BlogComments').append(data);\n"
+
" \t\t\t$('#form_comment').resetForm();\n"
+
" \t\t}); \n"
+
"\t\t\t}\n"
+
"\t\t}\n"
+
"\t});\n"
+
"\n"
+
" var at_datas = [];\n"
+
" $('img.SmallPortrait').each(function(){\n"
+
" var name = $(this).attr('alt');\n"
+
" if(jQuery.inArray(name, at_datas) < 0 && name != '黄亿华')\n"
+
" at_datas.push(name);\n"
+
" });\n"
+
" $(\"#form_comment textarea\").atWho(\"@\", {data: at_datas});\n"
+
"\n"
+
"\t$(\"#submiting\").ajaxStart(function(){\n"
+
"\t if(posting){\n"
+
" \t $('#btn_submit').attr(\"disabled\",\"disabled\");\n"
+
" $(this).show();\n"
+
"\t }\n"
+
" });\n"
+
"\t$(\"#submiting\").ajaxComplete(function(event,request, settings){\n"
+
"\t if(posting){\n"
+
" $(this).hide();\n"
+
" \t $('#btn_submit').attr(\"disabled\",\"\");\n"
+
"\t }\n"
+
"\t posting = false;\n"
+
" }); \n"
+
"\t\n"
+
" $(window).scroll(function() {\n"
+
" var lastScreen;\n"
+
" if ($(\"#postform\").length > 0)\n"
+
" lastScreen = getScrollY() + $(window).height() < $(\"#postform\").offset().top * 1 ? false : true;\n"
+
" else\n"
+
" lastScreen = getScrollY() + $(window).height() < $(document).height() * 1 ? false : true;\n"
+
" if (lastScreen && !upprev_closed) {\n"
+
" $(\"#RelativeBlogs\").stop().animate({right:\"0px\"});\n"
+
" upprev_hidden = false;\n"
+
" }\n"
+
" else if (upprev_closed && getScrollY() == 0) {\n"
+
" upprev_closed = false;\n"
+
" }\n"
+
" else if (!upprev_hidden) {\n"
+
" upprev_hidden = true;\n"
+
" $(\"#RelativeBlogs\").stop().animate({right:\"-400px\"});\n"
+
" }\n"
+
" });\n"
+
" $(\"#RelativeBlogs #btn_close\").click(function() {\n"
+
" $(\"#RelativeBlogs\").stop().animate({right:\"-400px\"});\n"
+
" upprev_closed = true;\n"
+
" upprev_hidden = true;\n"
+
" });\n"
+
"});\n"
+
"function delete_c(nid,uid,cid){\n"
+
" if(confirm(\"您确认要删除此篇评论?\")){\n"
+
" var args = \"cmt=\"+cid+\"#\"+uid+\"#\"+nid;\n"
+
" ajax_post(\"/action/blog/delete_blog_comments?space=190591\",args,function(){$(\"#cmt_\"+nid+\"_\"+uid+\"_\"+cid).fadeOut();});\n"
+
" }\n"
+
"}\n"
+
"function ReplyInline(blog,user,reply){\n"
+
"\t$('.inline_reply').empty();\n"
+
"\tvar div_id = '#inline_reply_of_'+blog+'_'+user+'_'+reply;\n"
+
"\t$('#inline_reply_id').val(user+'_'+reply);\n"
+
"\t$(div_id).html($('#inline_reply_editor').html());\n"
+
"\t$('#txt_focus').focus();\n"
+
"\t$('#btn_close_inline_reply').click(function(){\n"
+
"\t\t$(div_id).empty();\n"
+
"\t});\n"
+
"\t$('#form_inline_comment').ajaxForm({\n"
+
"\t\tdataType: 'json',\n"
+
" \tsuccess: function(json) {\n"
+
" \tif(json.msg){\n"
+
" \t\talert(json.msg);\n"
+
" \t}\n"
+
" \telse if(json.id){\n"
+
" \t\t\tlocation.reload();\n"
+
" \t}\n"
+
" \t}\n"
+
"\t});\n"
+
"}\n"
+
"function edit_catalogs(qid){\n"
+
"\tpopup(\"/set-catalogs?parent=1&type=3&id=\"+qid);\n"
+
"}\n"
+
"function vote(blogid){\n"
+
"\t\tajax_post(\"/action/blog/vote\",\"id=\"+blogid+\"&user=190591\",function(result){\n"
+
"\t\tvar json = eval('('+result+')');\n"
+
"\t\tif(json.vote)\n"
+
"\t\t\t$('#vote_count').html(json.vote);\n"
+
"\t\telse if(json.error == 1)\n"
+
"\t\t\talert(json.msg);\n"
+
"\t\telse\n"
+
"\t\t\talert(json.msg);\n"
+
"\t});\n"
+
"\t}\n"
+
"function toggle_recomm(blogid){\n"
+
"\tajax_post(\"/action/blog/toggle_recomm\",\"id=\"+blogid,function(html){\n"
+
"\t\tif(html == '-1')\n"
+
"\t\t\talert(\"文章不存在\");\n"
+
"\t\telse if(html == 0){\n"
+
"\t\t\t$('#lnk_recomm_'+blogid).removeClass('recommend');\n"
+
"\t\t\t$('#lnk_recomm_'+blogid).text(\"未推荐\");\n"
+
"\t\t}\n"
+
"\t\telse if(html == 1){\n"
+
"\t\t\t$('#lnk_recomm_'+blogid).addClass('recommend');\n"
+
"\t\t\t$('#lnk_recomm_'+blogid).text(\"已推荐\");\n"
+
"\t\t}\n"
+
"\t});\n"
+
"}\n"
+
"//-->\n"
+
"</script></div>\n"
+
"\t<div class='clear'></div>\n"
+
"\t<div id=\"OSC_Footer\"><style>\n"
+
".oscapp {text-align:left; width:220px;}\n"
+
".oscapp span {float:left;width:140px;}\n"
+
".oscapp a {float:left;text-indent:-9999em;width:16px;margin-left:8px;}\n"
+
".oscapp a.android {background:url('/img/android.gif') no-repeat left center;}\n"
+
".oscapp a.iphone {background:url('/img/iphone.gif') no-repeat left center;}\n"
+
".oscapp a.wp7 {background:url('/img/wp7.gif') no-repeat left center;}\n"
+
"</style>\n"
+
"<table width='100%'><tr>\n"
+
"<td align='left'>© 开源中国(OsChina.NET) | <a href=\"http://www.oschina.net/home/about\">关于我们</a> | <a href=\"mailto:oschina.net@gmail.com\">广告联系</a> | <a href=\"http://weibo.com/oschina2010\" target=\"_blank\">@新浪微博</a> | <a href=\"http://m.oschina.net/\">开源中国手机版</a> | <a href='http://www.miitbeian.gov.cn/' target='_blank' style='color:#737573;text-decoration:none;'>粤ICP备12009483号-3</a></td>\n"
+
"<td class='oscapp'>\n"
+
"\t<span>开源中国手机客户端:</span>\n"
+
"\t<a href=\"http://www.oschina.net/app\" class='android' title='Android客户端'>Android</a>\n"
+
"\t<a href=\"http://www.oschina.net/app\" class='iphone' title='iPhone 客户端'>iPhone</a>\n"
+
"\t<a href=\"http://www.oschina.net/app\" class='wp7' title='Windows Phone 客户端'>WP7</a>\n"
+
"</td>\n"
+
"</tr>\n"
+
"</table>\n"
+
"<script type='text/javascript'>\n"
+
"<!--\n"
+
"if (top.location != self.location)top.location=self.location;\n"
+
"//-->\n"
+
"</script></div>\n"
+
"</div>\n"
+
"</body>\n"
+
"\n"
+
"<script type=\"text/javascript\" src=\"/action/visit/space?id=190591\"></script>\n"
+
"<script type='text/javascript'>\n"
+
"<!--\n"
+
"$(document).ready(function() {\n"
+
"\n"
+
"\tSelectStyle(\"#search-item\",\".search_list\");\n"
+
"\t$('.Tweet .photo img').live(\"click\",function(){\n"
+
"\t\tvar T=$(this);\n"
+
"\t\tvar t=this;\n"
+
"\t\tvar bigImg = T.attr('bi');\n"
+
"\t\tvar smallImg = T.attr('si');\n"
+
"\t\tvar src = T.attr('src');\n"
+
"\t\tvar newsrc = (bigImg == src)?smallImg:bigImg;\n"
+
"\t\tvar imgId = T.attr('id');\n"
+
"\t\tif(newsrc == bigImg){\n"
+
" \t\tvar loading=$('<img alt=\"loading\" src=\"/img/loading.gif\"/>');\n"
+
"\t\t\tvar top = T.position().top+T.height()/2-8;\n"
+
"\t\t\tvar left = T.position().left+T.width()/2-8;\n"
+
"\t\t\tloading.css({\n"
+
"\t\t\t\t'position':'absolute',\n"
+
"\t\t\t\t'z-index':999,\n"
+
"\t\t\t\t'top':top,\n"
+
"\t\t\t\t'left':left\n"
+
"\t\t\t});\n"
+
" \t\tT.before(loading);\n"
+
"\t\t\tvar tImg=new Image();\n"
+
"\t\t\ttImg.src=newsrc;\n"
+
"\t\t\ttImg.onload=function(){afterImgLoad(T,loading,imgId,newsrc,bigImg);};\n"
+
"\t\t}\n"
+
"\t\telse{\n"
+
"\t\t\tT.attr(\"src\",newsrc);\n"
+
"\t\t\t$('#img_menu_'+imgId).remove();\n"
+
"\t\t}\n"
+
"\t\treturn false;\n"
+
"\t});\n"
+
"\t\n"
+
"\t$(\".tweet_thumb_wrapper\").mouseenter(function(){\n"
+
"\t\t$(this).find(\".tweet_play_video\").css(\"opacity\",1);\n"
+
"\t}).mouseleave(function(){\n"
+
"\t\t$(this).find(\".tweet_play_video\").css(\"opacity\",0.7);\n"
+
"\t});\n"
+
"\n"
+
" $(\"#TForm textarea\").atWho(\"@\", function(query, callback){\n"
+
" jQuery.ajax({\n"
+
" type:'POST',\n"
+
" url:\"/action/tweet/at_suggest\",\n"
+
" data:{'q':query},\n"
+
" dataType:'json',\n"
+
" success:function(json){\n"
+
" callback(json);\n"
+
" }\n"
+
" });\n"
+
" });\n"
+
"\t\n"
+
"\ttoggle_tweet_video = function(id){\n"
+
"\t\t$(\"#tweet_video_thumb_\"+id).toggle();\n"
+
"\t\tvar video = $(\"#tweet_video_\"+id).toggle();\n"
+
"\t\tvideo.siblings(\".tweet_video_operation,.tweet_thumb_wrapper\").toggle();\n"
+
"\t};\n"
+
"\t\n"
+
"\tfunction afterImgLoad(T,loading,imgId,url,bigImg){\n"
+
"\t\tvar lnks = \"<div id='img_menu_\"+imgId+\"' class='ImgMenu'>\";\n"
+
"\t\tlnks += \"<a href='#' onclick='$(\\\"#\"+imgId+\"\\\").click();return false;'>收起</a>\";\n"
+
"\t\tlnks += \"<a href='\"+bigImg+\"' target='_blank'>查看原图</a></div>\";\t\t\t\n"
+
"\t\tloading.remove();\n"
+
"\t\tT.attr(\"src\",url);\n"
+
"\t\tT.before(lnks);\n"
+
"\t}\n"
+
"});\n"
+
"\n"
+
"function set_fellow_memo(fid,fname){\n"
+
"\tpopup(\"/action/ajax/set_fellow_memo\",\"friend=\"+fid+\"&name=\"+fname);\n"
+
"}\n"
+
"\n"
+
"function deleteMsgs(uid, fid, fname){\n"
+
"\tif(!confirm(\"你确认要清除与‘\"+fname+\"’的所有留言信息吗?\"))\n"
+
"\t\treturn ;\n"
+
"\tvar args = \"user=\"+uid+\"&friend=\"+fid;\n"
+
"\tajax_post(\"/action/msg/delete_user\",args,function(html){\n"
+
"\t\tif(html.length > 0)\n"
+
"\t\t\talert(html);\n"
+
"\t\telse{\n"
+
"\t\t\t$('#Msg_'+fid).fadeOut();\n"
+
"\t\t}\n"
+
"\t});\n"
+
"}\n"
+
"\n"
+
"function follow_user(uid, uname){\n"
+
"\tjust_follow(uid, uname,'190591'); //oschina.js\n"
+
"}\n"
+
"\n"
+
"function unfollow_user(uid, uname){\n"
+
"\tif(confirm(\"确定不再关注\" + uname + \"了吗?\"))\n"
+
"\tjust_unfollow(uid,'190591',function(){\n"
+
"\t\talert('已取消对 ' + uname + ' 的关注');\n"
+
"\t});\n"
+
"}\n"
+
"\n"
+
"function tweet_reply(logid){\n"
+
"\tvar r = $('#LogReply_'+logid);\n"
+
"\tif(!r.is(\":hidden\")){\n"
+
"\t\tclose_tweet_reply(logid);\n"
+
"\t\treturn ;\n"
+
"\t}\n"
+
"\tr.html(\"<div class='TweetRplsWrapper'><span class='loading'>正在加载评论,请稍候...</span></div>\")\n"
+
"\tr.show();\n"
+
"\tr.load(\"http://my.oschina.net/flashsword/tweet-rpls?log=\"+logid,function(){\n"
+
"\t\t$('#edt_tweet_post_'+logid).focus();\n"
+
" var at_datas = [];\n"
+
" $(this).find(\"img.SmallPortrait\").each(function(){\n"
+
" var name = $(this).attr('alt');\n"
+
" if(jQuery.inArray(name, at_datas) < 0 && name != '黄亿华')\n"
+
" at_datas.push(name);\n"
+
" });\n"
+
" $(this).find(\"input.TXT_TweetRpl_Text\").atWho(\"@\", {data: at_datas});\n"
+
" $('#TweetReplyForm_'+logid).ajaxForm({\n"
+
" \tdataType: 'json',\n"
+
"\t\t\tbeforeSubmit: function(arr, form, options){\n"
+
"\t\t\t\t$('#BTN_TweetReply_'+logid).attr('disabled','disabled');\n"
+
"\t\t\t},\n"
+
" success: function(json) {\n"
+
" \tif(json.msg){\n"
+
" \t\t\talert(json.msg);\n"
+
" \t}else if(json.log){\n"
+
"\t\t\t\t\t$('#log_reply_count_'+logid).text(json.reply_count);\n"
+
" \t\t\t//插入新评论\t\t\t\t\t\n"
+
"\t\t\t\t\tajax_get(\"/action/ajax/get_tweet_reply?id=\" + json.log,true,function(html){\n"
+
"\t\t\t\t\t\t$('#LogReply_'+logid+' ul').prepend(html);\n"
+
"\t\t\t\t\t});\n"
+
"\t\t\t\t\t$('#edt_tweet_post_'+logid).val('');\n"
+
" \t}\n"
+
"\t\t\t\t$('#BTN_TweetReply_'+logid).removeAttr('disabled');\n"
+
" }\n"
+
" });\n"
+
"\t});\n"
+
"}\n"
+
"function close_tweet_reply(logid){\n"
+
"\t$('#LogReply_'+logid).empty();\n"
+
"\t$('#LogReply_'+logid).hide();\n"
+
"\t$('#Logs .userlogs li').removeClass('hover');\n"
+
"}\n"
+
"function reply_rtweet(logid, rid, toname){\n"
+
"\tvar edtPost = $('#edt_tweet_post_' + logid);\n"
+
"\tvar old_v = edtPost.val();\n"
+
"\tif(old_v.length > 0)\n"
+
"\t\tedtPost.val(old_v + ',@'+toname+' ');\n"
+
"\telse\n"
+
"\t\tedtPost.val('回复 @'+toname+' : ');\n"
+
"\tedtPost.focus();\n"
+
"\tedtPost.caretPos(edtPost.val().length); }\n"
+
"function delete_tweet(logid){\n"
+
"\tif(confirm(\"确认要删除这条信息吗?\"))\n"
+
"\tajax_post(\"/action/tweet/delete?log=\"+logid+\"&user=190591\",\"\",function(html){\n"
+
"\t\tif(html.length==0){\n"
+
"\t\t\tvar elem = $('#LI_'+logid);\n"
+
"\t\t\tif(elem.length > 0)\n"
+
"\t\t\t\t$('#LI_'+logid).fadeOut();\n"
+
"\t\t\telse\n"
+
"\t\t\t\tlocation.reload();\n"
+
"\t\t}\n"
+
"\t});\n"
+
"}\n"
+
"function delete_tweet_reply(logid){\n"
+
"\tif(confirm(\"确认要删除这条评论吗?\"))\n"
+
"\tajax_post(\"/action/tweet/delete_reply?id=\"+logid+\"&user=190591\",\"\",function(html){\n"
+
"\t\tif(html.length==0)\n"
+
"\t\t\t$('#TweetReply_'+logid).fadeOut();\n"
+
"\t});\n"
+
"}\n"
+
"\n"
+
"function delete_blog(blog_id){\n"
+
" if(!confirm(\"文章删除后无法恢复,请确认是否删除此篇文章?\")) return;\n"
+
" ajax_post(\"/action/blog/delete?id=\"+blog_id+\"&user=190591&user_code=tzm9Wg2YoU8SkJaTIjHQkahStiXQNyymUGXFOQgN\",\"\",function(html){\n"
+
" \tlocation.href=\"http://my.oschina.net/flashsword/blog\";\n"
+
" });\n"
+
"}\n"
+
"\n"
+
"function SelectStyle(on,option){\n"
+
"\tvar currentSort = $(on).attr('id');\n"
+
"\tvar currentText = $(option+\" li.\"+currentSort+\" a\").html();\n"
+
"\t$(on + \" .text\").html(currentText);\n"
+
"\t$(on + \" .text\").hover(function(){\n"
+
"\t\t$(this).addClass(\"hover\")\n"
+
"\t},function(){\n"
+
"\t\t$(this).removeClass(\"hover\")\n"
+
"\t});\n"
+
"\t$(option+\" li a\").each(function(index){\n"
+
"\t\t$(this).click(function(){\n"
+
"\t\t\tthishtml = $(this).html();\n"
+
"\t\t\t$(on + \" .text\").removeClass(\"on\").html(thishtml);\t\t\n"
+
"\t\t\t$(\".selectbox select \").find(\"option\").removeAttr('selected').eq(index).attr(\"selected\",\"selected\");\t\n"
+
"\t\t\t$(option).hide()\n"
+
"\t\t\treturn false;\n"
+
"\t\t});\n"
+
"\t\t\n"
+
"\t});\t\t\n"
+
"\t\n"
+
"\t$(\".selectbox\").click(function(){\t\t\n"
+
"\t\t$(option).toggle();\n"
+
"\t\t$(on + \" .text\").toggleClass(\"on\");\t\t\n"
+
"\t\treturn false;\n"
+
"\t});\n"
+
"\t$(document).click(function(){\n"
+
"\t\t$(option).hide();\t\n"
+
"\t\t$(on + \" .text\").removeClass(\"on\");\n"
+
"\t});\n"
+
"\t$(document).trigger('click');\n"
+
"\n"
+
"}\n"
+
"\n"
+
"//-->\n"
+
"</script>\n"
+
"</html>\n"
+
"\n"
+
"<!-- Generated by OsChina.NET (init:0[ms],page:83[ms],ip:58.241.37.50) -->"
;
}
webmagic-saxon/pom.xml
View file @
e06b0c1a
...
@@ -17,6 +17,11 @@
...
@@ -17,6 +17,11 @@
<artifactId>
webmagic-core
</artifactId>
<artifactId>
webmagic-core
</artifactId>
<version>
${project.version}
</version>
<version>
${project.version}
</version>
</dependency>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
xsoup
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
<dependency>
<dependency>
<groupId>
net.sf.saxon
</groupId>
<groupId>
net.sf.saxon
</groupId>
<artifactId>
Saxon-HE
</artifactId>
<artifactId>
Saxon-HE
</artifactId>
...
...
webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
View file @
e06b0c1a
package
us
.
codecraft
.
webmagic
.
selector
;
package
us
.
codecraft
.
webmagic
.
selector
;
import
org.htmlcleaner.HtmlCleaner
;
import
org.htmlcleaner.TagNode
;
import
org.htmlcleaner.XPatherException
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.junit.Assert
;
import
org.junit.Assert
;
import
org.junit.Ignore
;
import
org.junit.Ignore
;
import
org.junit.Test
;
import
org.junit.Test
;
import
us.codecraft.xsoup.XPathEvaluator
;
import
us.codecraft.xsoup.Xsoup
;
/**
/**
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
...
@@ -1353,6 +1360,7 @@ public class XpathSelectorTest {
...
@@ -1353,6 +1360,7 @@ public class XpathSelectorTest {
Html
html1
=
new
Html
(
html
);
Html
html1
=
new
Html
(
html
);
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
xpath
(
".//*[@class='QTitle']/h1/a"
).
toString
());
Assert
.
assertEquals
(
"再次吐槽easyui"
,
html1
.
xpath
(
".//*[@class='QTitle']/h1/a"
).
toString
());
Assert
.
assertNotNull
(
html1
.
$
(
"a[href]"
).
xpath
(
"//@href"
).
all
());
Assert
.
assertNotNull
(
html1
.
$
(
"a[href]"
).
xpath
(
"//@href"
).
all
());
Selectors
.
xpath
(
"/abc/"
).
select
(
""
);
}
}
@Test
@Test
...
@@ -1379,17 +1387,86 @@ public class XpathSelectorTest {
...
@@ -1379,17 +1387,86 @@ public class XpathSelectorTest {
xpath2Selector
.
selectList
(
html
);
xpath2Selector
.
selectList
(
html
);
}
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
XpathSelector
xpathSelector
=
new
XpathSelector
(
"//a"
);
XpathSelector
xpathSelector
=
new
XpathSelector
(
"//a"
);
time
=
System
.
currentTimeMillis
();
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
xpathSelector
.
selectList
(
html
);
xpathSelector
.
selectList
(
html
);
}
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
time
=
System
.
currentTimeMillis
();
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
xpath2Selector
.
selectList
(
html
);
xpath2Selector
.
selectList
(
html
);
}
}
System
.
out
.
println
(
System
.
currentTimeMillis
()
-
time
);
CssSelector
cssSelector
=
new
CssSelector
(
"a"
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
1000
;
i
++)
{
cssSelector
.
selectList
(
html
);
}
System
.
out
.
println
(
"css "
+(
System
.
currentTimeMillis
()-
time
));
}
@Ignore
(
"take long time"
)
@Test
public
void
parserPerformanceTest
()
throws
XPatherException
{
System
.
out
.
println
(
html
.
length
());
HtmlCleaner
htmlCleaner
=
new
HtmlCleaner
();
TagNode
tagNode
=
htmlCleaner
.
clean
(
html
);
Document
document
=
Jsoup
.
parse
(
html
);
long
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
htmlCleaner
.
clean
(
html
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
tagNode
.
evaluateXPath
(
"//a"
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
"============="
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
Jsoup
.
parse
(
html
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
document
.
select
(
"a"
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
"============="
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
htmlCleaner
.
clean
(
html
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
tagNode
.
evaluateXPath
(
"//a"
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
System
.
out
.
println
(
"============="
);
XPathEvaluator
compile
=
Xsoup
.
compile
(
"//a"
);
time
=
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
2000
;
i
++)
{
compile
.
evaluate
(
document
);
}
System
.
out
.
println
(
System
.
currentTimeMillis
()-
time
);
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment