SQL分词器1.10版
处理SQL及分词效果:
select * from ( select rownum as rn,tb1.stuid,tb1.summary from ( select stuid,sum(score) as summary from gk_score group by stuid order by summary desc ) tb1 order by tb1.summary desc ) tb2 where rn<11
Index Type No Text Type Desc
------------------------------------------------------------------------------------
0 1 select KW:select
1 2 * Text
2 4 from KW:from
3 18 ( (
4 1 select KW:select
5 2 rownum Text
6 13 as KW:as
7 2 rn Text
8 3 , Comma
9 2 tb1.stuid Text
10 3 , Comma
11 2 tb1.summary Text
12 4 from KW:from
13 18 ( (
14 1 select KW:select
15 2 stuid Text
16 3 , Comma
17 2 sum Text
18 18 ( (
19 2 score Text
20 19 ) )
21 13 as KW:as
22 2 summary Text
23 4 from KW:from
24 2 gk_score Text
25 14 group KW:group
26 10 by KW:by
27 2 stuid Text
28 9 order KW:order
29 10 by KW:by
30 2 summary Text
31 11 desc KW:asc
32 19 ) )
33 2 tb1 Text
34 9 order KW:order
35 10 by KW:by
36 2 tb1.summary Text
37 11 desc KW:asc
38 19 ) )
39 2 tb2 Text
40 5 where KW:where
41 2 rn Text
42 16 < <
43 2 11 Text
程序:
package com.heyang.easysql.lex10; import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A single lexical token produced by {@link Lexer}: the matched text plus a
 * numeric type code (one of the {@code TYPE_*} constants below).
 */
class Token {
    static final int TYPE_SELECT = 1;
    static final int TYPE_TEXT = 2;
    static final int TYPE_COMMA = 3;
    static final int TYPE_FROM = 4;
    static final int TYPE_WHERE = 5;
    static final int TYPE_AND = 6;
    static final int TYPE_EQUAL = 7;
    static final int TYPE_OR = 8;
    static final int TYPE_ORDER = 9;
    static final int TYPE_BY = 10;
    static final int TYPE_ASC = 11;
    static final int TYPE_DESC = 12;
    static final int TYPE_AS = 13;
    static final int TYPE_GROUP = 14;
    static final int TYPE_HAVING = 15;
    static final int TYPE_LESSTHAN = 16;
    static final int TYPE_GREATERTHAN = 17;
    static final int TYPE_OPEN_PARENTHESIS = 18;
    static final int TYPE_CLOSE_PARENTHESIS = 19;
    static final int TYPE_CONNECT = 20;
    static final int TYPE_LESSTHAN_OR_EQUAL = 21;
    static final int TYPE_GREATERTHAN_OR_EQUAL = 22;
    static final int TYPE_LESSTHAN_OR_GREATERTHAN = 23;
    static final int TYPE_CASE = 24;
    static final int TYPE_WHEN = 25;
    static final int TYPE_THEN = 26;
    static final int TYPE_ELSE = 27;
    static final int TYPE_END = 28;
    static final int TYPE_IS = 29;
    static final int TYPE_NULL = 30;
    static final int TYPE_TRUE = 31;
    static final int TYPE_FALSE = 32;
    static final int TYPE_PLUS = 33;
    static final int TYPE_MINUS = 34;
    // Code 35 (formerly TYPE_MULTI) is deliberately unassigned: the lexer has no
    // branch for '*', so it is emitted as part of a plain text token.
    static final int TYPE_DEVIDE = 36;
    static final int TYPE_DISTINCT = 37;
    static final int TYPE_OVER = 38;
    static final int TYPE_STRING_CONCAT = 39;
    static final int TYPE_ON = 40;
    static final int TYPE_JOIN = 41;
    static final int TYPE_INNER = 42;
    static final int TYPE_LEFT = 43;
    static final int TYPE_RIGHT = 44;
    static final int TYPE_OUTER = 45;
    static final int TYPE_FULL = 46;
    static final int TYPE_WITHIN = 47;
    static final int TYPE_PARTITION = 48;

    /** Numeric type code, one of the {@code TYPE_*} constants. */
    int type;

    /** The token text exactly as it appeared in the SQL (or the operator spelling). */
    String text;

    /**
     * Creates a token from a single character.
     *
     * @param c    the character that forms the token text
     * @param type one of the {@code TYPE_*} constants
     */
    public Token(char c, int type) {
        this.text = String.valueOf(c);
        this.type = type;
    }

    /**
     * Creates a token from a word or multi-character operator.
     *
     * @param word the token text
     * @param type one of the {@code TYPE_*} constants
     */
    public Token(String word, int type) {
        this.text = word;
        this.type = type;
    }

    /**
     * Returns a human-readable description of this token's type.
     *
     * @return {@code "KW:<keyword>"} for keywords, the operator spelling for
     *         operators, {@code "Comma"} / {@code "Text"} for those types, or
     *         {@code null} when the type code is not recognised
     */
    public String getTypeStr() {
        switch (type) {
            case TYPE_SELECT:                  return "KW:select";
            case TYPE_FROM:                    return "KW:from";
            case TYPE_COMMA:                   return "Comma";
            case TYPE_TEXT:                    return "Text";
            case TYPE_WHERE:                   return "KW:where";
            case TYPE_AND:                     return "KW:and";
            case TYPE_EQUAL:                   return "=";
            case TYPE_OR:                      return "KW:or";
            case TYPE_ORDER:                   return "KW:order";
            case TYPE_BY:                      return "KW:by";
            case TYPE_ASC:                     return "KW:asc";
            case TYPE_DESC:                    return "KW:desc";
            case TYPE_AS:                      return "KW:as";
            case TYPE_GROUP:                   return "KW:group";
            case TYPE_HAVING:                  return "KW:having";
            case TYPE_LESSTHAN:                return "<";
            case TYPE_GREATERTHAN:             return ">";
            case TYPE_OPEN_PARENTHESIS:        return "(";
            case TYPE_CLOSE_PARENTHESIS:       return ")";
            case TYPE_CONNECT:                 return "KW:connect";
            case TYPE_LESSTHAN_OR_EQUAL:       return "<=";
            case TYPE_GREATERTHAN_OR_EQUAL:    return ">=";
            case TYPE_LESSTHAN_OR_GREATERTHAN: return "<>";
            case TYPE_CASE:                    return "KW:case";
            case TYPE_WHEN:                    return "KW:when";
            case TYPE_THEN:                    return "KW:then";
            case TYPE_ELSE:                    return "KW:else";
            case TYPE_END:                     return "KW:end";
            case TYPE_IS:                      return "KW:is";
            case TYPE_NULL:                    return "KW:null";
            case TYPE_TRUE:                    return "KW:true";
            case TYPE_FALSE:                   return "KW:false";
            case TYPE_PLUS:                    return "+";
            case TYPE_MINUS:                   return "-";
            case TYPE_DEVIDE:                  return "/";
            case TYPE_DISTINCT:                return "KW:distinct";
            case TYPE_OVER:                    return "KW:over";
            case TYPE_STRING_CONCAT:           return "||";
            case TYPE_ON:                      return "KW:on";
            case TYPE_JOIN:                    return "KW:join";
            case TYPE_INNER:                   return "KW:inner";
            case TYPE_LEFT:                    return "KW:left";
            case TYPE_RIGHT:                   return "KW:right";
            case TYPE_OUTER:                   return "KW:outer";
            case TYPE_FULL:                    return "KW:full";
            case TYPE_WITHIN:                  return "KW:within";
            case TYPE_PARTITION:               return "KW:partition";
            default:                           return null;
        }
    }
}

/**
 * Splits a SQL string into a flat list of {@link Token}s.
 *
 * <p>Whitespace separates words; the characters {@code , = < > ( ) + - /} and the
 * two-character operators {@code <= <> >= ||} are emitted as their own tokens.
 * Every other run of characters becomes either a keyword token (matched
 * case-insensitively against the keyword table) or a {@code TYPE_TEXT} token.
 */
public class Lexer {
    /**
     * Keyword table: each entry pairs the keyword spelling with its type code.
     * Built once instead of on every lookup.
     */
    private static final Token[] KEYWORDS = {
        new Token("select", Token.TYPE_SELECT),
        new Token("from", Token.TYPE_FROM),
        new Token("where", Token.TYPE_WHERE),
        new Token("and", Token.TYPE_AND),
        new Token("or", Token.TYPE_OR),
        new Token("order", Token.TYPE_ORDER),
        new Token("by", Token.TYPE_BY),
        new Token("asc", Token.TYPE_ASC),
        // Previously "desc" was mapped to TYPE_ASC, so descending sorts were
        // reported as "KW:asc".
        new Token("desc", Token.TYPE_DESC),
        new Token("as", Token.TYPE_AS),
        new Token("group", Token.TYPE_GROUP),
        new Token("having", Token.TYPE_HAVING),
        new Token("connect", Token.TYPE_CONNECT),
        new Token("case", Token.TYPE_CASE),
        new Token("when", Token.TYPE_WHEN),
        new Token("then", Token.TYPE_THEN),
        new Token("else", Token.TYPE_ELSE),
        new Token("end", Token.TYPE_END),
        new Token("is", Token.TYPE_IS),
        new Token("null", Token.TYPE_NULL),
        new Token("true", Token.TYPE_TRUE),
        new Token("false", Token.TYPE_FALSE),
        new Token("distinct", Token.TYPE_DISTINCT),
        new Token("over", Token.TYPE_OVER),
        new Token("on", Token.TYPE_ON),
        new Token("join", Token.TYPE_JOIN),
        new Token("inner", Token.TYPE_INNER),
        new Token("left", Token.TYPE_LEFT),
        new Token("right", Token.TYPE_RIGHT),
        new Token("outer", Token.TYPE_OUTER),
        new Token("full", Token.TYPE_FULL),
        new Token("within", Token.TYPE_WITHIN),
        new Token("partition", Token.TYPE_PARTITION),
    };

    /** File read by {@link #main(String[])} when no path argument is supplied. */
    private static final String DEFAULT_SQL_FILE = "c:\\temp\\12.sql";

    private List<Token> tokenList;

    /**
     * Tokenizes the given SQL text; the result can be printed with
     * {@link #printTokenList()}.
     *
     * @param inputSql the SQL text to tokenize
     * @throws NullPointerException if {@code inputSql} is {@code null}
     */
    public Lexer(String inputSql) {
        if (inputSql == null) {
            throw new NullPointerException("inputSql must not be null");
        }
        String sql = inputSql.trim();
        StringBuilder pending = new StringBuilder();
        tokenList = new ArrayList<Token>();

        for (int i = 0; i < sql.length(); i++) {
            char c = sql.charAt(i);
            if (Character.isWhitespace(c)) {
                flushPending(pending);
                continue;
            }

            // '\0' stands for "no following character"; it matches no operator.
            char next = (i + 1 < sql.length()) ? sql.charAt(i + 1) : '\0';
            Token operator = matchOperator(c, next);
            if (operator == null) {
                // Not an operator (this includes a lone '|'): part of a word.
                pending.append(c);
            } else {
                flushPending(pending);
                tokenList.add(operator);
                // Skip the second character of a two-character operator.
                i += operator.text.length() - 1;
            }
        }
        // Emit the final word; the input was trimmed so nothing else flushes it.
        flushPending(pending);
    }

    /**
     * Recognises an operator or punctuation token starting at {@code c}.
     *
     * @param c    the current character
     * @param next the following character, or {@code '\0'} at end of input
     * @return the operator token, or {@code null} if {@code c} does not start one
     */
    private static Token matchOperator(char c, char next) {
        switch (c) {
            case ',':
                return new Token(c, Token.TYPE_COMMA);
            case '=':
                return new Token(c, Token.TYPE_EQUAL);
            case '<':
                if (next == '=') {
                    return new Token("<=", Token.TYPE_LESSTHAN_OR_EQUAL);
                }
                if (next == '>') {
                    return new Token("<>", Token.TYPE_LESSTHAN_OR_GREATERTHAN);
                }
                return new Token(c, Token.TYPE_LESSTHAN);
            case '>':
                if (next == '=') {
                    return new Token(">=", Token.TYPE_GREATERTHAN_OR_EQUAL);
                }
                return new Token(c, Token.TYPE_GREATERTHAN);
            case '|':
                // Only "||" is an operator. A single '|' used to be discarded,
                // silently gluing the surrounding text together.
                return next == '|' ? new Token("||", Token.TYPE_STRING_CONCAT) : null;
            case '(':
                return new Token(c, Token.TYPE_OPEN_PARENTHESIS);
            case ')':
                return new Token(c, Token.TYPE_CLOSE_PARENTHESIS);
            case '+':
                return new Token(c, Token.TYPE_PLUS);
            case '-':
                return new Token(c, Token.TYPE_MINUS);
            case '/':
                return new Token(c, Token.TYPE_DEVIDE);
            default:
                return null;
        }
    }

    /**
     * Looks up the type code for a word.
     *
     * @param text the word to classify
     * @return the keyword's type code (case-insensitive match), or
     *         {@link Token#TYPE_TEXT} when the word is not a keyword
     */
    private static int findTypeByText(String text) {
        for (Token keyword : KEYWORDS) {
            if (keyword.text.equalsIgnoreCase(text)) {
                return keyword.type;
            }
        }
        return Token.TYPE_TEXT;
    }

    /**
     * Emits the accumulated word (if any) as a keyword or text token and clears
     * the buffer.
     *
     * @param pending buffer holding the characters of the word being built
     */
    private void flushPending(StringBuilder pending) {
        String text = pending.toString();
        pending.setLength(0);
        if (!text.trim().isEmpty()) {
            tokenList.add(new Token(text, findTypeByText(text)));
        }
    }

    /**
     * Prints the token list to standard output as a table with the columns
     * Index, Type No, Text and Type Desc.
     */
    public void printTokenList() {
        final String separator = createRepeatedStr("-", 84);
        // The trailing "%s" receives the line terminator as an argument.
        final String layout = "%-20s %-20s %-20s %-20s %s";

        StringBuilder sb = new StringBuilder();
        sb.append(String.format(layout, "Index", "Type No", "Text", "Type Desc", "\n"));
        sb.append(separator + "\n");

        int index = 0;
        for (Token token : tokenList) {
            sb.append(String.format(layout, String.valueOf(index), String.valueOf(token.type),
                    token.text, token.getTypeStr(), "\n"));
            index++;
        }
        System.out.println(sb.toString());
    }

    /**
     * Returns {@code seed} repeated {@code n} times.
     */
    private static String createRepeatedStr(String seed, int n) {
        return String.join("", Collections.nCopies(n, seed));
    }

    /**
     * Reads a SQL file, prints the SQL on one line, then prints its tokens.
     *
     * @param args optional: {@code args[0]} is the path of the SQL file; when
     *             absent the default path {@code c:\temp\12.sql} is used
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] args) throws Exception {
        String path = args.length > 0 ? args[0] : DEFAULT_SQL_FILE;
        String sql = removeExtraSpace(readSqlFromFile(path));
        System.out.println(sql);
        new Lexer(sql).printTokenList();
    }

    /**
     * Reads a UTF-8 text file and joins its lines with single spaces.
     *
     * <p>Lines were previously concatenated with no separator, which fused the
     * last word of one line with the first word of the next.
     *
     * @param filePath path of the file to read
     * @return the file content on a single line
     * @throws Exception if the file cannot be opened or read
     */
    private static String readSqlFromFile(String filePath) throws Exception {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(line);
            }
        }
        return sb.toString();
    }

    /**
     * Collapses every run of two or more whitespace characters into one space.
     */
    private static String removeExtraSpace(String raw) {
        return raw.replaceAll("\\s{2,}", " ");
    }
}
--2020年5月13日 16点07分--
SQL分词器1.10版的更多相关文章
- elasticsearch安装ik分词器(极速版)
简介:下面讲有我已经打包并且编辑过的zip包,你可以在下面下载即可. 1.下载zip包.elasticsearch-analysis-ik-1.8.0.jar下面有附件链接[ik-安装包.zip],下 ...
- elasticsearch 6.2.4 安装 elasticsearch-analysis-ik 分词器 (windows 10下)
访问 https://github.com/medcl/elasticsearch-analysis-ik 找 releases 找到对应的 es 版本 下载 elasticsearch-analy ...
- 用lucene4.10.2分词器进行分词
import java.util.Iterator; import java.util.LinkedList; import java.util.List; import org.apache.luc ...
- Solr4.10与tomcat整合并安装中文分词器
1.solr Solr 是Apache下的一个顶级开源项目,采用Java开发,它是基于Lucene的全文搜索服务器.Solr提供了比Lucene更为丰富的查询语言,同时实现了可配置.可扩展,并对索引. ...
- Elasticsearch教程(三),IK分词器安装 (极速版)
如果只想快速安装IK,本教程管用.下面看经过. 简介: 下面讲有我已经打包并且编辑过的zip包,你可以在下面下载即可. 当前讲解的IK分词器 包的 version 为1.8. 一.下载zip包. 下面 ...
- 5.Solr4.10.3中配置中文分词器
转载请出自出处:http://www.cnblogs.com/hd3013779515/ 1.下载IK Analyzer 2012FF_hf1.zip并上传到/home/test 2.按照如下命令安装 ...
- Elasticsearch(10) --- 内置分词器、中文分词器
Elasticsearch(10) --- 内置分词器.中文分词器 这篇博客主要讲:分词器概念.ES内置分词器.ES中文分词器. 一.分词器概念 1.Analysis 和 Analyzer Analy ...
- 11大Java开源中文分词器的使用方法和分词效果对比,当前几个主要的Lucene中文分词器的比较
本文的目标有两个: 1.学会使用11大Java开源中文分词器 2.对比分析11大Java开源中文分词器的分词效果 本文给出了11大Java开源中文分词的使用方法以及分词结果对比代码,至于效果哪个好,那 ...
- Lucene.Net+盘古分词器(详细介绍)(转)
出处:http://www.cnblogs.com/magicchaiy/archive/2013/06/07/LuceneNet%E7%9B%98%E5%8F%A4%E5%88%86%E8%AF%8 ...
随机推荐
- NIO(三):Selector选择器
一.堵塞式与非堵塞式 在传统IO中,将数据由当前线程从客户端传入服务端,由服务端的内核进行判断传过来的数据是否合法,内核中是否存在数据. 如果不存在数据 ,并且数据并不合法,当前线程将会堵塞等待.当前 ...
- GitLab 查看版本号
cat /opt/gitlab/embedded/service/gitlab-rails/VERSION
- 基于Logistic回归和sigmoid函数的分类算法推导
此部分内容是对机器学习实战一书的第五章的数学推导,主要是对5.2节代码实现中,有一部分省去了相关的公式推导,这里进行了推导,后续会将算法进行java实现.此部分同样因为公式较多,采用手写推导,拍照记录 ...
- JS学习第九天
DOM创建表格: insertRow(index) 在指定索引位置插入一行 deleteRow(index) 删除表格中index索引处的行 insertCell(index) 在index处创建一个 ...
- HTTP PUT/POST/PATCH的区别
幂等 idempotent 幂等操作的特点就是其任意多次执行所产生的影响与一次执行的影响相同 POST POST 用来创建一个新的数据 POST 不是幂等的, 意味着多次操作的结果是不同的, 多次操作 ...
- 解决 EntityFrameworkCore tool 3.1 init.ps1 is not recognized 问题
昨天将项目升级到.net core 3.1后, 打开vs2019 Package Manager Console后突然发生了错误,如下 最终导致EntityFramework 迁移相关命令都不能正常使 ...
- 源码剖析Springboot自定义异常
博主看到新服务是封装的自定义异常,准备入手剖析一下,自定义的异常是如何进行抓住我们请求的方法的异常,并进行封装返回到.废话不多说,先看看如何才能实现封装异常,先来一个示例: @ControllerAd ...
- 关于数据库新建用户提示“用户、组或角色‘’XXX‘’在当前数据库中已存在”的解决办法
一般在还原数据库后,给这个数据库添加一个登录名时出现. 例如数据库备份文件中已经包含了用户abc,现在还原了数据库,然后发现现有数据库中没有abc这个用户,想要新建一个abc用户,作为该数据库的own ...
- Oracle 多条数据转一行逗号隔开
wm_concat 例: select wm_concat(市) from pa50 where apa132=省
- 【Flutter 实战】一文学会20多个动画组件
老孟导读:此篇文章是 Flutter 动画系列文章第三篇,后续还有动画序列.过渡动画.转场动画.自定义动画等. Flutter 系统提供了20多个动画组件,只要你把前面[动画核心](文末有链接)的文章 ...