Commit 1bdffc1e authored by 黄亿华

Merge pull request !337 from Almark Ming/master

parents 0c3ff3d6 91ed66ec
package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
 * Selector in regex.<br>
 * Extracts a single capture group of a regular expression from text. The
 * pattern is always compiled with {@link Pattern#DOTALL} and
 * {@link Pattern#CASE_INSENSITIVE}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class RegexSelector implements Selector {

    /** Pattern source actually compiled (wrapped in parentheses when the input had no capture group). */
    private final String regexStr;

    private final Pattern regex;

    /** Index of the capture group returned by {@link #select(String)} and {@link #selectList(String)}. */
    private final int group;

    /**
     * Creates a selector that extracts the given capture group.
     *
     * <p>If the pattern declares no capturing group at all, it is wrapped in
     * {@code ( )} so that group 1 is the whole match. The decision is based on
     * {@link Matcher#groupCount()} of the compiled pattern rather than on a
     * textual search for parentheses, so escaped parentheses ({@code \(}),
     * non-capturing groups ({@code (?:...)}), look-arounds, inline flags and
     * parentheses inside character classes are not mistaken for capture groups.
     *
     * @param regexStr the regular expression; must not be blank
     * @param group    index of the capture group to extract
     * @throws IllegalArgumentException if {@code regexStr} is blank or is not a valid pattern
     */
    public RegexSelector(String regexStr, int group) {
        if (StringUtils.isBlank(regexStr)) {
            throw new IllegalArgumentException("regex must not be empty");
        }
        Pattern compiled = compile(regexStr);
        // groupCount() is a property of the pattern, so an empty input is enough to query it.
        if (compiled.matcher("").groupCount() == 0) {
            regexStr = "(" + regexStr + ")";
            compiled = compile(regexStr);
        }
        this.regexStr = regexStr;
        this.regex = compiled;
        this.group = group;
    }

    /**
     * Creates a selector that extracts capture group 1.
     *
     * @param regexStr the regular expression; must not be blank
     * @throws IllegalArgumentException if {@code regexStr} is blank or is not a valid pattern
     */
    public RegexSelector(String regexStr) {
        this(regexStr, 1);
    }

    /** Compiles the pattern with the selector's fixed flags, translating syntax errors. */
    private static Pattern compile(String source) {
        try {
            return Pattern.compile(source, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
        } catch (PatternSyntaxException e) {
            throw new IllegalArgumentException("invalid regex", e);
        }
    }

    /** Copies every group of the matcher's current match (index 0 = whole match) into a result. */
    private static RegexResult toResult(Matcher matcher) {
        String[] groups = new String[matcher.groupCount() + 1];
        for (int i = 0; i < groups.length; i++) {
            groups[i] = matcher.group(i);
        }
        return new RegexResult(groups);
    }

    /**
     * Returns the configured group of the first match in {@code text}, as
     * reported by {@link RegexResult#get(int)}.
     *
     * @param text the text to search
     * @return the configured group of the first match
     */
    @Override
    public String select(String text) {
        return selectGroup(text).get(group);
    }

    /**
     * Returns the configured group of every match in {@code text}, in match order.
     *
     * @param text the text to search
     * @return one entry per match; empty when nothing matches
     */
    @Override
    public List<String> selectList(String text) {
        List<String> strings = new ArrayList<String>();
        for (RegexResult result : selectGroupList(text)) {
            strings.add(result.get(group));
        }
        return strings;
    }

    /**
     * Finds the first match and returns all of its groups.
     *
     * @param text the text to search
     * @return the groups of the first match, or {@link RegexResult#EMPTY_RESULT}
     *         when {@code text} is {@code null} or nothing matches
     */
    public RegexResult selectGroup(String text) {
        if (text == null) {
            // Pattern.matcher(null) would throw; treat missing text as "no match".
            return RegexResult.EMPTY_RESULT;
        }
        Matcher matcher = regex.matcher(text);
        if (matcher.find()) {
            return toResult(matcher);
        }
        return RegexResult.EMPTY_RESULT;
    }

    /**
     * Finds every match and returns the groups of each one.
     *
     * @param text the text to search
     * @return one result per match, in match order; empty when {@code text}
     *         is {@code null} or nothing matches
     */
    public List<RegexResult> selectGroupList(String text) {
        List<RegexResult> resultList = new ArrayList<RegexResult>();
        if (text == null) {
            return resultList;
        }
        Matcher matcher = regex.matcher(text);
        while (matcher.find()) {
            resultList.add(toResult(matcher));
        }
        return resultList;
    }

    /** Returns the pattern source that was compiled. */
    @Override
    public String toString() {
        return regexStr;
    }
}
package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
 * Selector in regex.<br>
 * Extracts a single capture group of a regular expression from text. The
 * pattern is always compiled with {@link Pattern#DOTALL} and
 * {@link Pattern#CASE_INSENSITIVE}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class RegexSelector implements Selector {

    /** Pattern source actually compiled (wrapped in parentheses when the input had no capture group). */
    private final String regexStr;

    private final Pattern regex;

    /** Index of the capture group returned by {@link #select(String)} and {@link #selectList(String)}. */
    private final int group;

    /**
     * Creates a selector that extracts the given capture group.
     *
     * <p>A pattern without any capturing group is wrapped in {@code ( )} so
     * that group 1 is the whole match. Whether a capturing group exists is
     * answered by the regex engine itself ({@link Matcher#groupCount()} on the
     * compiled pattern). Counting parenthesis substrings cannot do this
     * reliably: it misjudges escaped parentheses ({@code \(}), non-capturing
     * groups, look-arounds, inline flags such as {@code (?i)} and parentheses
     * inside character classes.
     *
     * @param regexStr the regular expression; must not be blank
     * @param group    index of the capture group to extract
     * @throws IllegalArgumentException if {@code regexStr} is blank or is not a valid pattern
     */
    public RegexSelector(String regexStr, int group) {
        if (StringUtils.isBlank(regexStr)) {
            throw new IllegalArgumentException("regex must not be empty");
        }
        Pattern compiled = compilePattern(regexStr);
        // The group count belongs to the pattern, so matching against "" is enough to read it.
        boolean hasCaptureGroup = compiled.matcher("").groupCount() > 0;
        if (!hasCaptureGroup) {
            regexStr = "(" + regexStr + ")";
            compiled = compilePattern(regexStr);
        }
        this.regexStr = regexStr;
        this.regex = compiled;
        this.group = group;
    }

    /**
     * Creates a selector that extracts capture group 1.
     *
     * @param regexStr the regular expression; must not be blank
     * @throws IllegalArgumentException if {@code regexStr} is blank or is not a valid pattern
     */
    public RegexSelector(String regexStr) {
        this(regexStr, 1);
    }

    /** Compiles with the selector's fixed flags and reports syntax errors as IllegalArgumentException. */
    private static Pattern compilePattern(String source) {
        try {
            return Pattern.compile(source, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
        } catch (PatternSyntaxException e) {
            throw new IllegalArgumentException("invalid regex", e);
        }
    }

    /** Snapshots all groups of the matcher's current match; index 0 is the whole match. */
    private static RegexResult captureGroups(Matcher matcher) {
        String[] groups = new String[matcher.groupCount() + 1];
        for (int i = 0; i < groups.length; i++) {
            groups[i] = matcher.group(i);
        }
        return new RegexResult(groups);
    }

    /**
     * Returns the configured group of the first match in {@code text}, as
     * reported by {@link RegexResult#get(int)}.
     *
     * @param text the text to search
     * @return the configured group of the first match
     */
    @Override
    public String select(String text) {
        return selectGroup(text).get(group);
    }

    /**
     * Returns the configured group of every match in {@code text}, in match order.
     *
     * @param text the text to search
     * @return one entry per match; empty when nothing matches
     */
    @Override
    public List<String> selectList(String text) {
        List<String> strings = new ArrayList<String>();
        for (RegexResult result : selectGroupList(text)) {
            strings.add(result.get(group));
        }
        return strings;
    }

    /**
     * Finds the first match and returns all of its groups.
     *
     * @param text the text to search
     * @return the groups of the first match, or {@link RegexResult#EMPTY_RESULT}
     *         when {@code text} is {@code null} or nothing matches
     */
    public RegexResult selectGroup(String text) {
        if (text == null) {
            // Pattern.matcher(null) would throw; treat missing text as "no match".
            return RegexResult.EMPTY_RESULT;
        }
        Matcher matcher = regex.matcher(text);
        return matcher.find() ? captureGroups(matcher) : RegexResult.EMPTY_RESULT;
    }

    /**
     * Finds every match and returns the groups of each one.
     *
     * @param text the text to search
     * @return one result per match, in match order; empty when {@code text}
     *         is {@code null} or nothing matches
     */
    public List<RegexResult> selectGroupList(String text) {
        List<RegexResult> resultList = new ArrayList<RegexResult>();
        if (text == null) {
            return resultList;
        }
        Matcher matcher = regex.matcher(text);
        while (matcher.find()) {
            resultList.add(captureGroups(matcher));
        }
        return resultList;
    }

    /** Returns the pattern source that was compiled. */
    @Override
    public String toString() {
        return regexStr;
    }
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment