java根据中文包含词和中文排除词,匹配文章中的命中的词组规则-创新互联
文章:"小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因";
创新互联为企业级客户提供一站式互联网+设计服务,主要包括成都网站设计、网站建设、app软件开发公司、小程序制作、宣传片制作、LOGO设计等,帮助客户快速提升营销能力和企业形象,创新互联各部门都有丰富的经验,可以确保每一个作品的质量和创作周期,同时每年都有很多新员工加入,为我们带来大量新的创意。规则说明:+代表与,|代表或,整个规则必须用()括起来
当排除词的规则命中其中一个时,整篇文章视为不匹配,返回false
包含词规则:"(小学生|(思绪|想象力+适应期))"
排除词规则:"(小x生)"
源码:WordMatcher.java
类文件 BoolParser.java 在我以前发布的文章《java 求字符串形式bool表达式的值》中:
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.function.Function;
public class WordMatcher {
private final ListincludeExprList;
private final ListexcludeExprList;
private final MapwordMapId = new HashMap<>();
private final MapidMapWord = new HashMap<>();
private final Tree tree = new Tree();
private SethitExpr = null;
public WordMatcher(String includeExpr, String excludeExpr) {
if (StringUtils.isBlank(includeExpr)) throw new RuntimeException("包含词不能为空");
Function>function = (expr) ->{
ListexprList = new ArrayList<>();
StringBuilder newExpr = new StringBuilder();
StringBuilder word = new StringBuilder();
int level = 0;
for (char aChar : expr.toCharArray()) {
switch (aChar) {
case ' ':
continue;
case '(':
level++;
if (level == 1) continue;
if (level >1) newExpr.append(aChar);
continue;
case ')':
level--;
break;
case '|':
break;
case '+':
aChar = '&';
break;
default:
word.append(aChar);
continue;
}
String s = word.toString();
if (s.isEmpty()) {
if( level == 0) break;
throw new RuntimeException("表达式语法错误:" + expr);
}
tree.insert(s);
char id;
if (!wordMapId.containsKey(s)) {
char c = (char) (wordMapId.size() + 256);
wordMapId.put(s, c);
idMapWord.put(c, s);
id = c;
} else {
id = wordMapId.get(s);
}
word = new StringBuilder();
newExpr.append(id);
if (level >1 || (level == 1 && aChar != '|')) newExpr.append(aChar);
if (level == 1&& aChar == '|') {
exprList.add(newExpr.toString());
newExpr = new StringBuilder();
}
}
if (level != 0 || word.length() != 0) throw new RuntimeException("表达式语法错误:" + expr);
if (newExpr.length() != 0) {
exprList.add(newExpr.toString());
}
return exprList;
};
this.includeExprList = function.apply(includeExpr);
if (includeExprList.isEmpty()) throw new RuntimeException("包含词表达式不能为空");
if (StringUtils.isBlank(excludeExpr)) this.excludeExprList = new ArrayList<>();
else this.excludeExprList = function.apply(excludeExpr);
}
public boolean match(String content) {
Objects.requireNonNull(content);
if (content.isEmpty()) throw new RuntimeException("empty string");
SetexistWords = new HashSet<>();
Listbuilders = new ArrayList<>();
for (char c : content.toCharArray()) {
{
for (StringBuilder builder : builders) {
builder.append(c);
}
String str = c + "";
boolean b = tree.containsTheWord(str);
if (b) {
builders.add(new StringBuilder(str));
}
if (tree.existTheWord(str)) {
existWords.add(str);
}
}
builders.removeIf(builder ->{
if (builder.length() == 1) return false;
String str = builder.toString();
boolean b = tree.containsTheWord(str);
if (!b) return true;
boolean exist = tree.existTheWord(str);
if (exist) {
existWords.add(str);
}
return false;
});
}
for (String exclude : excludeExprList) {
for (String word : wordMapId.keySet()) {
String id = wordMapId.get(word) + "";
if (existWords.contains(word)) {
exclude = exclude.replaceAll(id + "", "T");
} else {
exclude = exclude.replaceAll(id + "", "F");
}
}
if (BoolParser.parse(exclude)) {
hitExpr = new HashSet<>();
return false;
}
}
SetexprSet = new HashSet<>();
boolean result = false;
for (String include : includeExprList) {
String includeTemp = include;
for (String word : wordMapId.keySet()) {
String id = wordMapId.get(word) + "";
if (existWords.contains(word)) {
include = include.replaceAll(id + "", "T");
} else {
include = include.replaceAll(id + "", "F");
}
}
boolean parse = BoolParser.parse(include);
if (parse) {
for (Character id : idMapWord.keySet()) {
String word = idMapWord.get(id);
includeTemp = includeTemp.replaceAll(id + "", word);
}
exprSet.add(includeTemp.replaceAll("&", "+"));
}
if (!result) result = BoolParser.parse(include);
}
hitExpr = exprSet;
return result;
}
public SethitExpr() {
if (hitExpr == null) throw new RuntimeException("请先匹配文章");
return hitExpr;
}
//字典树
private static class Tree {
private final Mapnodes = new HashMap<>();
public Tree() {
}
public void insert(String word) {
Objects.requireNonNull(word);
if (word.isEmpty()) return;
char[] chars = word.toCharArray();
Node head = nodes.computeIfAbsent(chars[0], Node::new);
for (int i = 1; i< chars.length; i++) {
char aChar = chars[i];
head = head.putChild(aChar);
}
}
public boolean containsTheWord(String word) {
Objects.requireNonNull(word);
if (word.isEmpty()) throw new RuntimeException("empty string");
char[] chars = word.toCharArray();
if (!nodes.containsKey(chars[0])) return false;
Node node = nodes.get(chars[0]);
for (int i = 1; i< chars.length; i++) {
char aChar = chars[i];
Node child = node.getChild(aChar);
if (child == null) return false;
node = child;
}
return true;
}
public boolean existTheWord(String word) {
Objects.requireNonNull(word);
if (word.isEmpty()) throw new RuntimeException("empty string");
char[] chars = word.toCharArray();
if (!nodes.containsKey(chars[0])) return false;
Node node = nodes.get(chars[0]);
for (int i = 1; i< chars.length; i++) {
char aChar = chars[i];
Node child = node.getChild(aChar);
if (child == null) return false;
node = child;
}
return node.isEnd();
}
}
private static class Node {
protected final char value;
private final MapchildNodes = new HashMap<>();
public Node(char value) {
this.value = value;
}
public Node putChild(char value) {
return childNodes.computeIfAbsent(value, Node::new);
}
public Node getChild(char value) {
return childNodes.get(value);
}
public boolean containsNode(char value) {
return childNodes.containsKey(value);
}
public boolean isEnd() {
return childNodes.isEmpty();
}
}
}
使用示例1
// Usage example 1: an include rule plus an exclude rule.
// The include rule contains 适应a and the exclude rule is 小x生; neither string occurs in the article below.
public static void main(String[] args) {
WordMatcher matcher = new WordMatcher("(小学生|(思绪|想象力+适应a))", "(小x生)");
boolean result = matcher.match("小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因");
// 小学生 occurs in the article and the exclude word does not, so the first include alternative hits.
// NOTE(review): the article originally annotated the two lines below as "false" and "[]", which
// does not follow from the code shown - confirm by running it.
System.out.println(result);//expected: true (assuming BoolParser evaluates a lone T as true)
System.out.println(matcher.hitExpr());//expected to contain 小学生; whether (思绪|想象力+适应a) is also listed depends on BoolParser's precedence of & and |
}
使用示例2
// Usage example 2: include rule only; the blank exclude rule disables exclusion.
public static void main(String[] args) {
    String includeRule = "(小学生|(思绪|想象力+适应期))";
    String excludeRule = "";
    String article = "小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因";

    WordMatcher matcher = new WordMatcher(includeRule, excludeRule);
    boolean matched = matcher.match(article);

    System.out.println(matched);//true
    // The hit expressions come from a HashSet, so the printed order is not guaranteed.
    System.out.println(matcher.hitExpr());//[(思绪|想象力+适应期), 小学生]
}
你是否还在寻找稳定的海外服务器提供商?创新互联www.cdcxhl.cn海外机房具备T级流量清洗系统配攻击溯源,准确流量调度确保服务器高可用性,企业级服务器适合批量采购,新人活动首月15元起,快前往官网查看详情吧
文章标题:java根据中文包含词和中文排除词,匹配文章中的命中的词组规则-创新互联
浏览地址:http://pcwzsj.com/article/igdgd.html