From ba91209e4e1b895b795b578aab40231141597068 Mon Sep 17 00:00:00 2001 From: David Muir Sharnoff Date: Tue, 29 Mar 2022 21:46:36 -0700 Subject: [PATCH] sqltoken is now its own repo --- go.mod | 1 + go.sum | 10 + lsmysql/check.go | 2 +- sqltoken/tokenize.go | 739 -------------------------------------- sqltoken/tokenize_test.go | 731 ------------------------------------- 5 files changed, 12 insertions(+), 1471 deletions(-) delete mode 100644 sqltoken/tokenize.go delete mode 100644 sqltoken/tokenize_test.go diff --git a/go.mod b/go.mod index 1810c66..073cad1 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.16 require ( github.com/go-sql-driver/mysql v1.5.0 github.com/lib/pq v1.10.2 + github.com/muir/sqltoken v0.0.4 github.com/muir/testinglogur v0.0.0-20210705185900-bc47cbaaadca github.com/pkg/errors v0.9.1 github.com/stretchr/testify v1.7.0 diff --git a/go.sum b/go.sum index 25aaba3..6575572 100644 --- a/go.sum +++ b/go.sum @@ -1,11 +1,15 @@ +github.com/alvaroloes/enumer v1.1.2/go.mod h1:FxrjvuXoDAx9isTJrv4c+T410zFi0DtXIT0m65DJ+Wo= github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gGcHOs= github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/lib/pq v1.10.2 h1:AqzbZs4ZoCBp+GtejcpCpcxM3zlSMx29dXbUSeVtJb8= github.com/lib/pq v1.10.2/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/muir/sqltoken v0.0.4 h1:SioNnG90ZYXmlfnPaUxUdNC1dFkhKL64pDeS+wXZ8k8= +github.com/muir/sqltoken v0.0.4/go.mod h1:6hPsZxszMpYyNf12og4f4VShFo/Qipz6Of0cn5KGAAU= github.com/muir/testinglogur v0.0.0-20210705185900-bc47cbaaadca h1:umBSRx6i2/+1gbab8wlghfL7vPhBGr8ZwlKlo1nRg04= github.com/muir/testinglogur v0.0.0-20210705185900-bc47cbaaadca/go.mod h1:18iL5fVrQ2hu0NeXKtEE9pS5jgdaNTgqWHNl+p33g6M= +github.com/pascaldekloe/name v0.0.0-20180628100202-0fd16699aae1/go.mod h1:eD5JxqMiuNYyFNmyY9rkJ/slN8y59oEu4Ei7F8OoKWQ= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -13,6 +17,12 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/tools v0.0.0-20190524210228-3d17549cdc6b/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= diff --git a/lsmysql/check.go b/lsmysql/check.go index 8c0e41b..0b79ac4 100644 --- a/lsmysql/check.go +++ b/lsmysql/check.go @@ -4,7 +4,7 @@ import ( "regexp" "strings" - "github.com/muir/libschema/sqltoken" + "github.com/muir/sqltoken" ) type CheckResult string diff --git a/sqltoken/tokenize.go b/sqltoken/tokenize.go deleted file mode 100644 index dc4ca56..0000000 --- a/sqltoken/tokenize.go +++ /dev/null @@ -1,739 +0,0 @@ -package sqltoken - -import ( - "fmt" - "strings" - "unicode" - "unicode/utf8" -) - -type TokenType string - -const ( - Comment TokenType = "comment" - Whitespace = "whitespace" - QuestionMark = "questionMark" // used in MySQL substitution - DollarNumber = "dollarNumber" // used in PostgreSQL substitution - ColonWord = "colonWord" // used in sqlx substitution - Literal = "literal" // strings - Number = "number" - Semicolon = "semicolon" - Punctuation = "punctuation" - Word = "word" - Other = "other" // control characters and other non-printables -) - -func combineOkay(t TokenType) bool { - switch t { - case Number, QuestionMark, DollarNumber, ColonWord: - return false - } - return true -} - -type Token struct { - Type TokenType - Text string -} - -// Config specifies the behavior of Tokenize as relates to behavior -// that differs between SQL implementations -type Config struct { - // Tokenize ? as type Question (used by MySQL) - NoticeQuestionMark bool - - // Tokenize $7 as type DollarNumber (PostgreSQL) - NoticeDollarNumber bool - - // Tokenize :word as type ColonWord (sqlx) - NoticeColonWord bool - - // Tokenize # as type comment (MySQL) - NoticeHashComment bool - - // $q$ stuff $q$ and $$stuff$$ quoting (PostgreSQL) - NoticeDollarQuotes bool - - // NoticeHexValues 0xa0 x'af' X'AF' (MySQL) - NoticeHexNumbers bool - - // NoticeBinaryValues 0x01 b'01' B'01' (MySQL) - NoticeBinaryNumbers bool - - // NoticeUAmpPrefix U& utf prefix U&"\0441\043B\043E\043D" (PostgreSQL) - NoticeUAmpPrefix bool - - // NoticeCharsetLiteral _latin1'string' n'string' (MySQL) - NoticeCharsetLiteral bool -} - -type Tokens []Token - -type TokensList []Tokens - -func MySQLConfig() Config { - return Config{ - NoticeQuestionMark: true, - NoticeHashComment: true, - NoticeHexNumbers: true, - NoticeBinaryNumbers: true, - NoticeCharsetLiteral: true, - } -} -func PostgreSQLConfig() Config { - return Config{ - NoticeDollarNumber: true, - NoticeDollarQuotes: true, - NoticeUAmpPrefix: true, - } -} - -func TokenizeMySQL(s string) Tokens { - return Tokenize(s, MySQLConfig()) -} - -func TokenizePostgreSQL(s string) Tokens { - return Tokenize(s, PostgreSQLConfig()) -} - -const debug = false - -// Tokenize breaks up SQL strings into Token objects. No attempt is made -// to break successive punctuation. -func Tokenize(s string, config Config) Tokens { - if len(s) == 0 { - return []Token{} - } - tokens := make([]Token, 0, len(s)/5) - tokenStart := 0 - var i int - var firstDollarEnd int - - // Why is this written with Goto you might ask? It's written - // with goto because RE2 can't handle complex regex and PCRE - // has external dependencies and thus isn't friendly for libraries. - // So, it could have had a switch with a state variable, but that's - // just a way to do goto that's lower performance. Might as - // well do goto the natural way. - - token := func(t TokenType) { - if debug { - fmt.Printf("> %s: {%s}\n", t, s[tokenStart:i]) - } - if i-tokenStart == 0 { - return - } - if len(tokens) > 0 && tokens[len(tokens)-1].Type == t && combineOkay(t) { - tokens[len(tokens)-1].Text += s[tokenStart:i] - } else { - tokens = append(tokens, Token{ - Type: t, - Text: s[tokenStart:i], - }) - } - tokenStart = i - } - -BaseState: - for i < len(s) { - c := s[i] - i++ - switch c { - case '/': - if i < len(s) && s[i] == '*' { - goto CStyleComment - } - token(Punctuation) - case '\'': - goto SingleQuoteString - case '"': - goto DoubleQuoteString - case '-': - if i < len(s) && s[i] == '-' { - goto SkipToEOL - } - token(Punctuation) - case '#': - if config.NoticeHashComment { - goto SkipToEOL - } - token(Punctuation) - case ';': - token(Semicolon) - case '?': - if config.NoticeQuestionMark { - token(QuestionMark) - } else { - token(Punctuation) - } - case ' ', '\n', '\r', '\t', '\b', '\v', '\f': - goto Whitespace - case '.': - goto PossibleNumber - case '~', '`', '!', '%', '^', '&', '*', '(', ')', '+', '=', '{', '}', '[', ']', - '|', '\\', ':', '<', '>', ',': - token(Punctuation) - case '$': - // $1 - // $seq$ stuff $seq$ - // $$stuff$$ - if config.NoticeDollarQuotes || config.NoticeDollarNumber { - goto Dollar - } - token(Punctuation) - case 'U': - // U&'d\0061t\+000061' - if config.NoticeUAmpPrefix && i+1 < len(s) && s[i] == '&' && s[i+1] == '\'' { - i += 2 - goto SingleQuoteString - } - goto Word - case 'x', 'X': - // X'1f' x'1f' - if config.NoticeHexNumbers && i < len(s) && s[i] == '\'' { - i++ - goto QuotedHexNumber - } - goto Word - case 'b', 'B': - if config.NoticeBinaryNumbers && i < len(s) && s[i] == '\'' { - i++ - goto QuotedBinaryNumber - } - goto Word - case 'a' /*b*/, 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', - 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w' /*x*/, 'y', 'z', - 'A' /*B*/, 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', - 'N', 'O', 'P', 'Q', 'R', 'S', 'T' /*U*/, 'V', 'W' /*X*/, 'Y', 'Z', - '_': - // This covers the entire alphabet except specific letters that have - // been handled above. This case is actually just a performance - // hack: if there were a letter missing it would be caught below - // by unicode.IsLetter() - goto Word - case '0': - if config.NoticeHexNumbers && i < len(s) && s[i] == 'x' { - i++ - goto HexNumber - } - if config.NoticeBinaryNumbers && i < len(s) && s[i] == 'b' { - i++ - goto BinaryNumber - } - goto Number - case /*0*/ '1', '2', '3', '4', '5', '6', '7', '8', '9': - goto Number - default: - r, w := utf8.DecodeRuneInString(s[i-1:]) - switch { - case r == '⎖': - // "⎖" is the unicode decimal separator -- an alternative to "." - i += w - 1 - goto NumberNoDot - case unicode.IsDigit(r): - i += w - 1 - goto Number - case unicode.IsPunct(r) || unicode.IsSymbol(r) || unicode.IsMark(r): - i += w - 1 - token(Punctuation) - case unicode.IsLetter(r): - i += w - 1 - goto Word - case unicode.IsControl(r) || unicode.IsSpace(r): - i += w - 1 - goto Whitespace - default: - i += w - 1 - token(Other) - } - } - } - goto Done - -CStyleComment: - for i < len(s) { - c := s[i] - i++ - switch c { - case '*': - if i < len(s) && s[i] == '/' { - i++ - token(Comment) - goto BaseState - } - } - } - token(Comment) - goto Done - -SingleQuoteString: - for i < len(s) { - c := s[i] - i++ - switch c { - case '\'': - token(Literal) - goto BaseState - case '\\': - if i < len(s) { - i++ - } else { - token(Literal) - goto Done - } - } - } - token(Literal) - goto Done - -DoubleQuoteString: - for i < len(s) { - c := s[i] - i++ - switch c { - case '"': - token(Literal) - goto BaseState - case '\\': - if i < len(s) { - i++ - } else { - token(Literal) - goto Done - } - } - } - token(Literal) - goto Done - -SkipToEOL: - for i < len(s) { - c := s[i] - i++ - switch c { - case '\n': - token(Comment) - goto BaseState - } - } - token(Comment) - goto Done - -Word: - for i < len(s) { - c := s[i] - switch c { - case 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', - 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', - 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - '_', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - // This covers the entire alphabet and numbers. - // This case is actually just a performance - // hack: if there were a letter missing it would be caught below - // by unicode.IsLetter() - i++ - continue - case '\'': - if config.NoticeCharsetLiteral { - switch s[tokenStart] { - case 'n', 'N': - if i-tokenStart == 1 { - i++ - goto SingleQuoteString - } - case '_': - i++ - goto SingleQuoteString - } - } - } - r, w := utf8.DecodeRuneInString(s[i:]) - if unicode.IsLetter(r) || unicode.IsDigit(r) { - i += w - continue - } - token(Word) - goto BaseState - } - token(Word) - goto Done - -PossibleNumber: - if i < len(s) { - c := s[i] - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - i++ - goto NumberNoDot - default: - r, w := utf8.DecodeRuneInString(s[i:]) - i += w - if unicode.IsDigit(r) { - goto NumberNoDot - } - token(Punctuation) - goto BaseState - } - } - token(Punctuation) - goto Done - -Number: - for i < len(s) { - c := s[i] - i++ - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - // okay - case '.': - goto NumberNoDot - case 'e', 'E': - if i < len(s) { - switch s[i] { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - i++ - goto Exponent - } - r, w := utf8.DecodeRuneInString(s[i:]) - if unicode.IsDigit(r) { - i += w - goto Exponent - } - } - i-- - token(Number) - goto Word - default: - r, w := utf8.DecodeRuneInString(s[i-1:]) - if r == '⎖' { - i += w - 1 - goto NumberNoDot - } - if !unicode.IsDigit(r) { - i-- - token(Number) - goto BaseState - } - i += w - 1 - } - } - token(Number) - goto Done - -NumberNoDot: - for i < len(s) { - c := s[i] - i++ - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - // okay - case 'e', 'E': - if i < len(s) { - switch s[i] { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - i++ - goto Exponent - } - } - i-- - token(Number) - goto Word - default: - r, w := utf8.DecodeRuneInString(s[i-1:]) - if !unicode.IsDigit(r) { - i-- - token(Number) - goto BaseState - } - i += w - 1 - } - } - token(Number) - goto Done - -Exponent: - if i < len(s) { - c := s[i] - i++ - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - goto ExponentConfirmed - default: - r, w := utf8.DecodeRuneInString(s[i-1:]) - if !unicode.IsDigit(r) { - i -= 1 - token(Number) - goto BaseState - } - i += w - 1 - goto ExponentConfirmed - } - } - token(Number) - goto BaseState - -ExponentConfirmed: - for i < len(s) { - c := s[i] - i++ - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - // okay - default: - r, w := utf8.DecodeRuneInString(s[i-1:]) - if !unicode.IsDigit(r) { - i-- - token(Number) - goto BaseState - } - i += w - 1 - } - } - token(Number) - goto Done - -HexNumber: - for i < len(s) { - c := s[i] - i++ - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', - 'a', 'b', 'c', 'd', 'e', 'f', - 'A', 'B', 'C', 'D', 'E', 'F': - // okay - default: - i-- - token(Number) - goto BaseState - } - } - token(Number) - goto Done - -BinaryNumber: - for i < len(s) { - c := s[i] - i++ - switch c { - case '0', '1': - // okay - default: - i-- - token(Number) - goto BaseState - } - } - token(Number) - goto Done - -Whitespace: - for i < len(s) { - c := s[i] - i++ - switch c { - case ' ', '\n', '\r', '\t', '\b', '\v', '\f': - default: - r, w := utf8.DecodeRuneInString(s[i-1:]) - if !unicode.IsSpace(r) && !unicode.IsControl(r) { - i-- - token(Whitespace) - goto BaseState - } - i += w - 1 - } - } - token(Whitespace) - goto Done - -QuotedHexNumber: - for i < len(s) { - c := s[i] - i++ - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', - 'a', 'b', 'c', 'd', 'e', 'f', - 'A', 'B', 'C', 'D', 'E', 'F': - // okay - case '\'': - token(Number) - goto BaseState - default: - i-- - token(Number) - goto BaseState - } - } - token(Number) - goto Done - -QuotedBinaryNumber: - for i < len(s) { - c := s[i] - i++ - switch c { - case '0', '1': - // okay - case '\'': - token(Number) - goto BaseState - default: - i-- - token(Number) - goto BaseState - } - } - token(Number) - goto Done - -Dollar: - // $1 - // $seq$ stuff $seq$ - // $$stuff$$ - firstDollarEnd = i - if i < len(s) { - c := s[i] - if config.NoticeDollarQuotes { - if c == '$' { - e := strings.Index(s[i+1:], "$$") - if e == -1 { - i = firstDollarEnd - token(Punctuation) - goto BaseState - } - i += 3 + e - token(Literal) - goto BaseState - } - r, w := utf8.DecodeRuneInString(s[i:]) - if unicode.IsLetter(r) { - i += w - for i < len(s) { - c := s[i] - r, w := utf8.DecodeRuneInString(s[i:]) - i++ - if c == '$' { - endToken := s[tokenStart:i] - e := strings.Index(s[i:], endToken) - if e == -1 { - i = firstDollarEnd - token(Punctuation) - goto BaseState - } - i += e + len(endToken) - token(Literal) - goto BaseState - } else if unicode.IsLetter(r) { - i += w - 1 - continue - } else { - i = firstDollarEnd - token(Punctuation) - goto BaseState - } - } - } - } - if config.NoticeDollarNumber { - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - i++ - for i < len(s) { - c := s[i] - i++ - switch c { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - continue - } - i-- - break - } - token(DollarNumber) - goto BaseState - } - } - token(Punctuation) - goto BaseState - } - token(Punctuation) - goto Done - -Done: - return tokens -} - -func (ts Tokens) String() string { - if len(ts) == 0 { - return "" - } - strs := make([]string, len(ts)) - for i, t := range ts { - strs[i] = t.Text - } - return strings.Join(strs, "") -} - -// Strip removes leading/trailing whitespace and semicolors -// and strips all internal comments. Internal whitespace -// is changed to a single space. -func (ts Tokens) Strip() Tokens { - i := 0 - for i < len(ts) { - switch ts[i].Type { - case Comment, Whitespace, Semicolon: - i++ - continue - } - break - } - c := make(Tokens, 0, len(ts)) - var lastReal int - for i < len(ts) { - switch ts[i].Type { - case Comment: - continue - case Whitespace: - c = append(c, Token{ - Type: Whitespace, - Text: " ", - }) - case Semicolon: - c = append(c, ts[i]) - default: - c = append(c, ts[i]) - lastReal = len(c) - } - i++ - } - c = c[:lastReal] - return c -} - -// CmdSplit breaks up the token array into multiple token arrays, -// one per command (splitting on ";") -func (ts Tokens) CmdSplit() TokensList { - var r TokensList - start := 0 - for i, t := range ts { - if t.Type == Semicolon { - r = append(r, Tokens(ts[start:i]).Strip()) - start = i + 1 - } - } - if start < len(ts) { - r = append(r, Tokens(ts[start:]).Strip()) - } - return r -} - -func (tl TokensList) Strings() []string { - r := make([]string, 0, len(tl)) - for _, ts := range tl { - s := ts.String() - if s != "" { - r = append(r, s) - } - } - return r -} diff --git a/sqltoken/tokenize_test.go b/sqltoken/tokenize_test.go deleted file mode 100644 index 66add2c..0000000 --- a/sqltoken/tokenize_test.go +++ /dev/null @@ -1,731 +0,0 @@ -package sqltoken - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -var commonCases = []Tokens{ - {}, - { - {Type: Word, Text: "c01"}, - }, - { - {Type: Word, Text: "c02"}, - {Type: Semicolon, Text: ";"}, - {Type: Word, Text: "morestuff"}, - }, - { - {Type: Word, Text: "c03"}, - {Type: Comment, Text: "--cmt;\n"}, - {Type: Word, Text: "stuff2"}, - }, - { - {Type: Word, Text: "c04"}, - {Type: Punctuation, Text: "-"}, - {Type: Word, Text: "an"}, - {Type: Punctuation, Text: "-"}, - {Type: Word, Text: "dom"}, - }, - { - {Type: Word, Text: "c05_singles"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "''"}, - {Type: Whitespace, Text: " \t"}, - {Type: Literal, Text: "'\\''"}, - {Type: Semicolon, Text: ";"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "';\\''"}, - }, - { - {Type: Word, Text: "c06_doubles"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: `""`}, - {Type: Whitespace, Text: " \t"}, - {Type: Literal, Text: `"\""`}, - {Type: Semicolon, Text: ";"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: `";\""`}, - }, - { - {Type: Word, Text: "c07_singles"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "-"}, - {Type: Literal, Text: "''"}, - {Type: Whitespace, Text: " \t"}, - {Type: Punctuation, Text: "-"}, - {Type: Literal, Text: "'\\''"}, - {Type: Semicolon, Text: ";"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "-"}, - {Type: Literal, Text: "';\\''"}, - }, - { - {Type: Word, Text: "c08_doubles"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "-"}, - {Type: Literal, Text: `""`}, - {Type: Whitespace, Text: " \t"}, - {Type: Punctuation, Text: "-"}, - {Type: Literal, Text: `"\""`}, - {Type: Semicolon, Text: ";"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "-"}, - {Type: Literal, Text: `";\""`}, - }, - { - {Type: Word, Text: "c09"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "r"}, - {Type: Punctuation, Text: "-"}, - {Type: Word, Text: "an"}, - {Type: Punctuation, Text: "-"}, - {Type: Word, Text: "dom"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: `";;"`}, - {Type: Semicolon, Text: ";"}, - {Type: Literal, Text: "';'"}, - {Type: Punctuation, Text: "-"}, - {Type: Literal, Text: `";"`}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "-"}, - {Type: Literal, Text: "';'"}, - {Type: Punctuation, Text: "-"}, - }, - { - {Type: Word, Text: "c10"}, - {Type: Punctuation, Text: "-//"}, - }, - { - {Type: Word, Text: "c11"}, - {Type: Punctuation, Text: "-//-/-"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c12"}, - {Type: Punctuation, Text: "/"}, - {Type: Literal, Text: `";"`}, - {Type: Whitespace, Text: "\r\n"}, - {Type: Literal, Text: `";"`}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "-/"}, - {Type: Literal, Text: `";"`}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c13"}, - {Type: Punctuation, Text: "/"}, - {Type: Literal, Text: "';'"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "';'"}, - {Type: Whitespace, Text: " "}, - {Type: Comment, Text: "/*;*/"}, - {Type: Punctuation, Text: "-/"}, - {Type: Literal, Text: "';'"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c14"}, - {Type: Punctuation, Text: "-"}, - {Type: Comment, Text: "/*;*/"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "-/"}, - {Type: Comment, Text: "/*\n\t;*/"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c15"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: ".5"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c16"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: ".5"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "0.5"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "30.5"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "40"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "40.13"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "40.15e8"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "40e8"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: ".4e8"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: ".4e20"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c17"}, - {Type: Whitespace, Text: " "}, - {Type: Comment, Text: "/* foo \n */"}, - }, - { - {Type: Word, Text: "c18"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "'unterminated "}, - }, - { - {Type: Word, Text: "c19"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: `"unterminated `}, - }, - { - {Type: Word, Text: "c20"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: `'unterminated \`}, - }, - { - {Type: Word, Text: "c21"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: `"unterminated \`}, - }, - { - {Type: Word, Text: "c22"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: ".@"}, - }, - { - {Type: Word, Text: "c23"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: ".@"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c24"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "7"}, - {Type: Word, Text: "ee"}, - }, - { - {Type: Word, Text: "c25"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "7"}, - {Type: Word, Text: "eg"}, - }, - { - {Type: Word, Text: "c26"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "7"}, - {Type: Word, Text: "ee"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c27"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "7"}, - {Type: Word, Text: "eg"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c28"}, - {Type: Whitespace, Text: " "}, - {Type: Comment, Text: "/* foo "}, - }, - { - {Type: Word, Text: "c29"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "7e8"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c30"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "7e8"}, - }, - { - {Type: Word, Text: "c31"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "7.0"}, - {Type: Word, Text: "e"}, - }, - { - {Type: Word, Text: "c32"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "7.0"}, - {Type: Word, Text: "e"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c33"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "eèҾ"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "ҾeèҾ"}, - }, - { - {Type: Word, Text: "c34"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "⁖"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "+⁖"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "+⁖*"}, - }, - { - {Type: Word, Text: "c35"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "๒"}, - }, - { - {Type: Word, Text: "c36"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "๒"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c37"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "๒⎖๒"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c38"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "⎖๒"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c39"}, - {Type: Whitespace, Text: " "}, - {Type: Comment, Text: "-- comment w/o end"}, - }, - { - {Type: Word, Text: "c40"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: ".๒"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c40"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "abnormal"}, - {Type: Whitespace, Text: " "}, // this is a unicode space character - {Type: Word, Text: "space"}, - }, - { - {Type: Word, Text: "c41"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "abnormal"}, - {Type: Whitespace, Text: "  "}, // this is a unicode space character - {Type: Word, Text: "space"}, - }, - { - {Type: Word, Text: "c42"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "abnormal"}, - {Type: Whitespace, Text: "  "}, // this is a unicode space character - {Type: Word, Text: "space"}, - }, - { - {Type: Word, Text: "c43"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "."}, - }, - { - {Type: Word, Text: "c44"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "๒๒"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c45"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "3e๒๒๒"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "c46"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "3.7"}, - }, - { - {Type: Word, Text: "c47"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "3.7e19"}, - }, - { - {Type: Word, Text: "c48"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "3.7e2"}, - }, - { - {Type: Word, Text: "c49"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "😀"}, // I'm not sure I agree with the classification - }, - { - {Type: Word, Text: "c50"}, - {Type: Whitespace, Text: " \x00"}, - }, - { - {Type: Word, Text: "c51"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "x"}, - {Type: Whitespace, Text: "\x00"}, - }, - { - {Type: Comment, Text: "-- c52\n"}, - }, - { - {Type: Word, Text: "c53"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "z"}, - {Type: Literal, Text: "'not a prefixed literal'"}, - }, -} - -var mySQLCases = []Tokens{ - { - {Type: Word, Text: "m01"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "-"}, - {Type: Comment, Text: "# /# #;\n"}, - {Type: Whitespace, Text: "\t"}, - {Type: Word, Text: "foo"}, - }, - { - {Type: Word, Text: "m02"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "'#;'"}, - {Type: Punctuation, Text: ","}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: `"#;"`}, - {Type: Punctuation, Text: ","}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "-"}, - {Type: Comment, Text: "# /# #;\n"}, - {Type: Whitespace, Text: "\t"}, - {Type: Word, Text: "foo"}, - }, - { - {Type: Word, Text: "m03"}, - {Type: Whitespace, Text: " "}, - {Type: QuestionMark, Text: "?"}, - {Type: QuestionMark, Text: "?"}, - }, - { - {Type: Word, Text: "m04"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "$"}, - {Type: Number, Text: "5"}, - }, - { - {Type: Word, Text: "m05"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "U"}, - {Type: Punctuation, Text: "&"}, - {Type: Literal, Text: `'d\0061t\+000061'`}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "m06"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "0x1f"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "x'1f'"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "X'1f'"}, - }, - { - {Type: Word, Text: "m07"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "0b01"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "b'010'"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "B'110'"}, - }, - { - {Type: Word, Text: "m08"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "0b01"}, - }, - { - {Type: Word, Text: "m09"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "0x01"}, - }, - { - {Type: Word, Text: "m10"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "x'1f"}, - {Type: Punctuation, Text: "&"}, - }, - { - {Type: Word, Text: "m10"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "b'1"}, - {Type: Number, Text: "7"}, - }, - { - {Type: Word, Text: "m11"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "$$"}, - {Type: Word, Text: "footext"}, - {Type: Punctuation, Text: "$$"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "m12"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "b'10"}, - }, - { - {Type: Word, Text: "m13"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "x'1f"}, - }, - { - {Type: Word, Text: "m14"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "n'national charset'"}, - }, - { - {Type: Word, Text: "m14"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "_utf8'redundent'"}, - }, -} - -var postgreSQLCases = []Tokens{ - { - {Type: Word, Text: "p01"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "#"}, - {Type: Word, Text: "foo"}, - {Type: Whitespace, Text: "\n"}, - }, - { - {Type: Word, Text: "p02"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "?"}, - {Type: Whitespace, Text: "\n"}, - }, - { - {Type: Word, Text: "p03"}, - {Type: Whitespace, Text: " "}, - {Type: DollarNumber, Text: "$17"}, - {Type: DollarNumber, Text: "$8"}, - }, - { - {Type: Word, Text: "p04"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: `U&'d\0061t\+000061'`}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "p05"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "0"}, - {Type: Word, Text: "x1f"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "x"}, - {Type: Literal, Text: "'1f'"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "X"}, - {Type: Literal, Text: "'1f'"}, - }, - { - {Type: Word, Text: "p06"}, - {Type: Whitespace, Text: " "}, - {Type: Number, Text: "0"}, - {Type: Word, Text: "b01"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "b"}, - {Type: Literal, Text: "'010'"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "B"}, - {Type: Literal, Text: "'110'"}, - }, - { - {Type: Word, Text: "p07"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "$$footext$$"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "p08"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "$$foo!text$$"}, - }, - { - {Type: Word, Text: "p09"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "$q$foo$$text$q$"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "p10"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "$q$foo$$text$q$"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "p11"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "$$"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "p12"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "$$"}, - }, - { - {Type: Word, Text: "p13"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "$"}, - {Type: Word, Text: "q"}, - {Type: Punctuation, Text: "$"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "p14"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "$ҾeèҾ$ $ DLa 32498 $ҾeèҾ$"}, - {Type: Punctuation, Text: "$"}, - }, - { - {Type: Word, Text: "p15"}, - {Type: Whitespace, Text: " "}, - {Type: Literal, Text: "$ҾeèҾ$ $ DLa 32498 $ҾeèҾ$"}, - }, - { - {Type: Word, Text: "p16"}, - {Type: Whitespace, Text: " "}, - {Type: Punctuation, Text: "$"}, - {Type: Word, Text: "foo"}, - {Type: Punctuation, Text: "-$"}, - {Type: Word, Text: "bar"}, - {Type: Punctuation, Text: "$"}, - {Type: Word, Text: "foo"}, - {Type: Punctuation, Text: "-$"}, - {Type: Whitespace, Text: " "}, - }, - { - {Type: Word, Text: "p16"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "n"}, - {Type: Literal, Text: "'mysql only'"}, - }, - { - {Type: Word, Text: "p16"}, - {Type: Whitespace, Text: " "}, - {Type: Word, Text: "_utf8"}, - {Type: Literal, Text: "'mysql only'"}, - }, -} - -func doTestInner(t *testing.T, tc Tokens, f func(string) Tokens) { - text := tc.String() - t.Log("---------------------------------------") - t.Log(text) - t.Log("-----------------") - got := f(text) - require.Equal(t, text, got.String(), tc.String()) - require.Equal(t, tc, got, tc.String()) -} - -func doTest(t *testing.T, tc Tokens, f func(string) Tokens) { - if len(tc) == 0 { - t.Run("null", func(t *testing.T) { - doTestInner(t, tc, f) - return - }) - return - } - t.Run(tc[0].Text, func(t *testing.T) { - doTestInner(t, tc, f) - return - }) -} - -func testMySQL(t *testing.T, tc Tokens) { - doTest(t, tc, func(s string) Tokens { - return TokenizeMySQL(s) - }) -} - -func testPostgreSQL(t *testing.T, tc Tokens) { - doTest(t, tc, func(s string) Tokens { - return TokenizePostgreSQL(s) - }) -} - -func TestMySQLTokenizing(t *testing.T) { - for _, tc := range commonCases { - testMySQL(t, tc) - } - for _, tc := range mySQLCases { - testMySQL(t, tc) - } -} - -func TestPostgresSQLTokenizing(t *testing.T) { - for _, tc := range commonCases { - testPostgreSQL(t, tc) - } - for _, tc := range postgreSQLCases { - testPostgreSQL(t, tc) - } -} - -func TestStrip(t *testing.T) { - cases := []struct { - before string - after string - }{ - { - before: "", - after: "", - }, - { - before: "-- stuff\n", - after: "", - }, - { - before: " /* foo */ bar \n baz ; ", - after: "bar baz", - }, - } - for _, tc := range cases { - ts := TokenizeMySQL(tc.before) - require.Equal(t, tc.after, ts.Strip().String(), tc.before) - } -} - -func TestCmdSplit(t *testing.T) { - cases := []struct { - input string - want []string - }{ - { - input: "", - want: []string{}, - }, - { - input: "-- stuff\n", - want: []string{}, - }, - { - input: " /* foo */ bar \n baz ; ", - want: []string{"bar baz"}, - }, - { - input: " /* foo */ bar \n ;baz ; ", - want: []string{"bar", "baz"}, - }, - } - for _, tc := range cases { - ts := TokenizeMySQL(tc.input) - require.Equal(t, tc.want, ts.CmdSplit().Strings(), tc.input) - } -}