Commit 9866297e authored by yihua.huang's avatar yihua.huang

Disable jsoup entity escape by Default. Set Html.DISABLE_HTML_ENTITY_ESCAPE to...

Disable jsoup entity escape by Default. Set Html.DISABLE_HTML_ENTITY_ESCAPE to false to enable it.  #149
parent 4e6e946d
......@@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -19,6 +20,24 @@ public class Html extends HtmlNode {
/** Logger named after the runtime class of this instance. */
private Logger logger = LoggerFactory.getLogger(getClass());
/** Set to true once the jsoup entity maps have been cleared, so the clearing is done only once. */
private static volatile boolean INITED = false;
/**
 * Whether jsoup html entity escape should be disabled. Defaults to {@code true}.
 * To keep jsoup's escaping, set it to {@code false} before any Html instance is created:
 * once an instance has cleared the jsoup entity maps, changing this flag has no further effect.
 */
public static boolean DISABLE_HTML_ENTITY_ESCAPE = true;
/**
 * Turns off jsoup html entity escape by emptying the entity maps of the
 * {@code base} and {@code extended} escape modes. This is a hack that is
 * only known to work with jsoup 1.7.2.
 * <p>
 * Does nothing when {@link #DISABLE_HTML_ENTITY_ESCAPE} is {@code false} or when the
 * maps have already been cleared by an earlier call.
 */
private void disableJsoupHtmlEntityEscape() {
    if (!DISABLE_HTML_ENTITY_ESCAPE || INITED) {
        return;
    }
    Entities.EscapeMode[] modesToClear = {Entities.EscapeMode.base, Entities.EscapeMode.extended};
    for (Entities.EscapeMode mode : modesToClear) {
        mode.getMap().clear();
    }
    // Mark as done only after both maps are empty.
    INITED = true;
}
/**
* Store parsed document for better performance when only one text exist.
*/
......@@ -26,6 +45,7 @@ public class Html extends HtmlNode {
public Html(String text) {
try {
disableJsoupHtmlEntityEscape();
this.document = Jsoup.parse(text);
} catch (Exception e) {
this.document = null;
......
......@@ -3,6 +3,8 @@ package us.codecraft.webmagic;
import org.junit.Test;
import us.codecraft.webmagic.selector.Html;
import static org.assertj.core.api.Assertions.assertThat;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
......@@ -13,9 +15,19 @@ public class HtmlTest {
/**
 * Checks that a regex selection followed by a replace yields the expected text.
 */
@Test
public void testRegexSelector() {
    Html selectable = new Html("aaaaaaab");
    // Presumably replace() rewrites every "aa(a)" match as "$1bb": "aaa|aaa|ab" -> "abb|abb|ab".
    assertThat(selectable.regex("(a+b)").replace("aa(a)", "$1bb").toString()).isEqualTo("abbabbab");
}
/**
 * With entity escape disabled, a bare {@code &} in the input is expected to stay unescaped.
 * <p>
 * The flag is set explicitly rather than relying on its default, because other tests
 * may have changed the static value earlier in the same JVM; the previous value is
 * restored afterwards.
 */
@Test
public void testDisableJsoupHtmlEntityEscape() throws Exception {
    boolean previous = Html.DISABLE_HTML_ENTITY_ESCAPE;
    Html.DISABLE_HTML_ENTITY_ESCAPE = true;
    try {
        Html html = new Html("aaaaaaa&b");
        assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
    } finally {
        Html.DISABLE_HTML_ENTITY_ESCAPE = previous;
    }
}
/**
 * With entity escape enabled, a bare {@code &} in the input is expected to come back as
 * {@code &amp;}.
 * <p>
 * The static flag is restored in {@code finally} so this test does not leak its setting
 * into tests that run later in the same JVM.
 * NOTE(review): this presumably only passes if no Html instance was created earlier with
 * the flag set to true, since the entity maps are cleared once and never refilled - confirm.
 */
@Test
public void testEnableJsoupHtmlEntityEscape() throws Exception {
    boolean previous = Html.DISABLE_HTML_ENTITY_ESCAPE;
    Html.DISABLE_HTML_ENTITY_ESCAPE = false;
    try {
        Html html = new Html("aaaaaaa&b");
        assertThat(html.regex("(aaaaaaa&amp;b)").toString()).isEqualTo("aaaaaaa&amp;b");
    } finally {
        Html.DISABLE_HTML_ENTITY_ESCAPE = previous;
    }
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment