@@ -7,6 +7,7 @@ import (
 	_ "regexp"
 	"strings"
 	"unicode"
+	"unicode/utf8"
 )
 
 // TokenType represents the type of a lexical token
@@ -38,6 +39,11 @@ const (
 	TokenRL // Right Left
 	TokenLR // Left Right
 
+	// State diagram action keywords
+	TokenEntry // entry
+	TokenExit  // exit
+	TokenDo    // do
+
 	// Identifiers and literals
 	TokenID
 	TokenString
@@ -67,11 +73,29 @@ const (
 	TokenArrowCircle // --o
 
 	// ER diagram relationship tokens
-	TokenEROneToMany  // ||--o{
-	TokenERManyToOne  // }o--||
-	TokenEROneToOne   // ||--||
-	TokenERManyToMany // }o--o{
-	TokenERZeroToOne  // ||--o|
+	TokenEROneToMany     // ||--o{
+	TokenEROneToManyAlt  // ||--|{
+	TokenERManyToOne     // }o--||
+	TokenEROneToOne      // ||--||
+	TokenERManyToMany    // }o--o{
+	TokenERManyToManyAlt // }|..|{
+	TokenERZeroToOne     // ||--o|
+
+	// Class diagram relationship tokens
+	TokenClassInheritance // <|--
+	TokenClassComposition // *--
+	TokenClassAggregation // o--
+	TokenClassAssociation // -->
+	TokenClassRealization // ..|>
+	TokenClassDependency  // ..>
+
+	// ER diagram cardinality tokens
+	TokenERCardOnlyOne    // ||
+	TokenERCardZeroOrOne  // o
+	TokenERCardZeroOrMore // o{
+	TokenERCardOneOrMore  // |{
+	TokenERCardCloseOne   // }|
+	TokenERCardCloseZero  // }o
 
 	// Edge modifiers
 	TokenPipe // |
@@ -140,6 +164,9 @@ var tokenTypeNames = map[TokenType]string{
 	TokenBT: "BT",
 	TokenRL: "RL",
 	TokenLR: "LR",
+	TokenEntry: "ENTRY",
+	TokenExit:  "EXIT",
+	TokenDo:    "DO",
 	TokenID:         "ID",
 	TokenString:     "STRING",
 	TokenNodeString: "NODE_STRING",
@@ -163,10 +190,24 @@ var tokenTypeNames = map[TokenType]string{
 	TokenArrowCross:  "ARROW_CROSS",
 	TokenArrowCircle: "ARROW_CIRCLE",
 	TokenEROneToMany: "ER_ONE_TO_MANY",
+	TokenEROneToManyAlt: "ER_ONE_TO_MANY_ALT",
 	TokenERManyToOne:  "ER_MANY_TO_ONE",
 	TokenEROneToOne:   "ER_ONE_TO_ONE",
 	TokenERManyToMany: "ER_MANY_TO_MANY",
+	TokenERManyToManyAlt: "ER_MANY_TO_MANY_ALT",
 	TokenERZeroToOne: "ER_ZERO_TO_ONE",
+	TokenClassInheritance: "CLASS_INHERITANCE",
+	TokenClassComposition: "CLASS_COMPOSITION",
+	TokenClassAggregation: "CLASS_AGGREGATION",
+	TokenClassAssociation: "CLASS_ASSOCIATION",
+	TokenClassRealization: "CLASS_REALIZATION",
+	TokenClassDependency:  "CLASS_DEPENDENCY",
+	TokenERCardOnlyOne:    "ER_CARD_ONLY_ONE",
+	TokenERCardZeroOrOne:  "ER_CARD_ZERO_OR_ONE",
+	TokenERCardZeroOrMore: "ER_CARD_ZERO_OR_MORE",
+	TokenERCardOneOrMore:  "ER_CARD_ONE_OR_MORE",
+	TokenERCardCloseOne:   "ER_CARD_CLOSE_ONE",
+	TokenERCardCloseZero:  "ER_CARD_CLOSE_ZERO",
 	TokenPipe:      "PIPE",
 	TokenColon:     "COLON",
 	TokenSemicolon: "SEMICOLON",
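
Keeping the `TokenType` constants and `tokenTypeNames` in lockstep by hand gets riskier with each batch of additions like the one above. A small standalone sketch of the iota-enum/name-map pattern this file uses, plus a `String` method that makes a missing map entry visible instead of silent (the method is illustrative, not something this diff adds):

```go
package main

import "fmt"

type TokenType int

// A trimmed version of the const block above: iota assigns sequential values.
const (
	TokenEntry TokenType = iota
	TokenExit
	TokenDo
)

var tokenTypeNames = map[TokenType]string{
	TokenEntry: "ENTRY",
	TokenExit:  "EXIT",
	TokenDo:    "DO",
}

// String falls back to the numeric value, so a token type that was added to
// the enum but not to the map still prints something diagnosable.
func (t TokenType) String() string {
	if name, ok := tokenTypeNames[t]; ok {
		return name
	}
	return fmt.Sprintf("TokenType(%d)", int(t))
}

func main() {
	fmt.Println(TokenDo)       // DO
	fmt.Println(TokenType(42)) // TokenType(42)
}
```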
@@ -243,8 +284,8 @@ func (l *Lexer) nextToken() error {
 		return nil
 	}
 
-	// Keywords and identifiers
-	if unicode.IsLetter(ch) || ch == '_' {
+	// Keywords and identifiers - support Unicode letters including Chinese
+	if l.isUnicodeIdentifierStart(ch) {
 		return l.consumeIdentifier()
 	}
 
@@ -264,6 +305,28 @@ func (l *Lexer) nextToken() error {
 
 // tryMultiCharOperator attempts to match multi-character operators
 func (l *Lexer) tryMultiCharOperator() TokenType {
+	// Check for state diagram special markers first
+	if l.matchString("<<fork>>") {
+		l.addTokenAndAdvance(TokenID, "<<fork>>", 8)
+		return TokenID
+	}
+	if l.matchString("<<join>>") {
+		l.addTokenAndAdvance(TokenID, "<<join>>", 8)
+		return TokenID
+	}
+	if l.matchString("<<choice>>") {
+		l.addTokenAndAdvance(TokenID, "<<choice>>", 10)
+		return TokenID
+	}
+	if l.matchString("<<history>>") {
+		l.addTokenAndAdvance(TokenID, "<<history>>", 11)
+		return TokenID
+	}
+	if l.matchString("<<deepHistory>>") {
+		l.addTokenAndAdvance(TokenID, "<<deepHistory>>", 15)
+		return TokenID
+	}
+
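
The five marker checks above are structurally identical, so they could be collapsed into a loop over a table, ordered longest first out of the same caution the rest of this function needs. A standalone sketch (`matchMarker` is a stand-in for the lexer's `matchString`-based dispatch, not code from this diff):

```go
package main

import (
	"fmt"
	"strings"
)

// stateMarkers lists the composite-state markers, longest first, so no
// marker can be shadowed by a shorter one sharing a prefix.
var stateMarkers = []string{
	"<<deepHistory>>", "<<history>>", "<<choice>>", "<<fork>>", "<<join>>",
}

// matchMarker reports which marker, if any, begins at input[pos:].
func matchMarker(input string, pos int) (string, bool) {
	for _, m := range stateMarkers {
		if strings.HasPrefix(input[pos:], m) {
			return m, true
		}
	}
	return "", false
}

func main() {
	input := "state f <<fork>>"
	if m, ok := matchMarker(input, 8); ok {
		fmt.Printf("matched %q, advance %d characters\n", m, len(m))
	}
}
```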
 	// Check for ER diagram relationships first (these must be matched before shorter patterns)
 	if l.matchString("||--o{") {
 		l.addTokenAndAdvance(TokenEROneToMany, "||--o{", 6)
@@ -285,16 +348,139 @@ func (l *Lexer) tryMultiCharOperator() TokenType {
 		l.addTokenAndAdvance(TokenERZeroToOne, "||--o|", 6)
 		return TokenERZeroToOne
 	}
+	if l.matchString("||--|{") {
+		l.addTokenAndAdvance(TokenEROneToManyAlt, "||--|{", 6)
+		return TokenEROneToManyAlt
+	}
+	if l.matchString("}|..|{") {
+		l.addTokenAndAdvance(TokenERManyToManyAlt, "}|..|{", 6)
+		return TokenERManyToManyAlt
+	}
 
-	// Check for sequence diagram arrows
-	if l.matchString("->>") {
-		l.addTokenAndAdvance(TokenArrowSolid, "->>", 3)
+	// Check for class diagram relationship symbols
+	if l.matchString("<|--") {
+		l.addTokenAndAdvance(TokenClassInheritance, "<|--", 4)
+		return TokenClassInheritance
+	}
+	if l.matchString("*--") {
+		l.addTokenAndAdvance(TokenClassComposition, "*--", 3)
+		return TokenClassComposition
+	}
+	if l.matchString("o--") {
+		l.addTokenAndAdvance(TokenClassAggregation, "o--", 3)
+		return TokenClassAggregation
+	}
+	// Guarded so the longer sequence arrow "-->>" is not split into "-->" + ">"
+	if l.matchString("-->") && !l.matchString("-->>") {
+		l.addTokenAndAdvance(TokenClassAssociation, "-->", 3)
+		return TokenClassAssociation
+	}
+	if l.matchString("..|>") {
+		l.addTokenAndAdvance(TokenClassRealization, "..|>", 4)
+		return TokenClassRealization
+	}
+	if l.matchString("..>") {
+		l.addTokenAndAdvance(TokenClassDependency, "..>", 3)
+		return TokenClassDependency
+	}
+
+	// Check for ER diagram cardinality symbols
+	if l.matchString("||") {
+		l.addTokenAndAdvance(TokenERCardOnlyOne, "||", 2)
+		return TokenERCardOnlyOne
+	}
+	if l.matchString("o{") {
+		l.addTokenAndAdvance(TokenERCardZeroOrMore, "o{", 2)
+		return TokenERCardZeroOrMore
+	}
+	if l.matchString("|{") {
+		l.addTokenAndAdvance(TokenERCardOneOrMore, "|{", 2)
+		return TokenERCardOneOrMore
+	}
+	if l.matchString("}|") {
+		l.addTokenAndAdvance(TokenERCardCloseOne, "}|", 2)
+		return TokenERCardCloseOne
+	}
+	if l.matchString("}o") {
+		l.addTokenAndAdvance(TokenERCardCloseZero, "}o", 2)
+		return TokenERCardCloseZero
+	}
+
+	// Check for diagram type declarations ("stateDiagram-v2" before "stateDiagram")
+	if l.matchString("stateDiagram-v2") {
+		l.addTokenAndAdvance(TokenID, "stateDiagram-v2", 15)
+		return TokenID
+	}
+	if l.matchString("stateDiagram") {
+		l.addTokenAndAdvance(TokenID, "stateDiagram", 12)
+		return TokenID
+	}
+	if l.matchString("sequenceDiagram") {
+		l.addTokenAndAdvance(TokenID, "sequenceDiagram", 15)
+		return TokenID
+	}
+	if l.matchString("erDiagram") {
+		l.addTokenAndAdvance(TokenID, "erDiagram", 9)
+		return TokenID
+	}
+
+	// Check for sequence diagram arrows (longer patterns first, following mermaidjs)
+	// Bidirectional arrows
+	if l.matchString("<<-->>") {
+		l.addTokenAndAdvance(TokenArrowDotted, "<<-->>", 6)
+		return TokenArrowDotted
+	}
+	if l.matchString("<<->>") {
+		l.addTokenAndAdvance(TokenArrowSolid, "<<->>", 5)
 		return TokenArrowSolid
 	}
+
+	// Complex arrows with directional markers
+	if l.matchString("--|\\") {
+		l.addTokenAndAdvance(TokenArrowSolid, "--|\\", 4)
+		return TokenArrowSolid
+	}
+	if l.matchString("--|/") {
+		l.addTokenAndAdvance(TokenArrowSolid, "--|/", 4)
+		return TokenArrowSolid
+	}
+	if l.matchString("-|\\") {
+		l.addTokenAndAdvance(TokenArrowSolid, "-|\\", 3)
+		return TokenArrowSolid
+	}
+	if l.matchString("-|/") {
+		l.addTokenAndAdvance(TokenArrowSolid, "-|/", 3)
+		return TokenArrowSolid
+	}
+
+	// Standard arrows
 	if l.matchString("-->>") {
 		l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4)
 		return TokenArrowDotted
 	}
+	if l.matchString("->>") {
+		l.addTokenAndAdvance(TokenArrowSolid, "->>", 3)
+		return TokenArrowSolid
+	}
+	if l.matchString("<->") {
+		l.addTokenAndAdvance(TokenArrowSolid, "<->", 3)
+		return TokenArrowSolid
+	}
+	if l.matchString("-x") {
+		l.addTokenAndAdvance(TokenArrowCross, "-x", 2)
+		return TokenArrowCross
+	}
+	if l.matchString("--)") {
+		l.addTokenAndAdvance(TokenArrowOpen, "--)", 3)
+		return TokenArrowOpen
+	}
+	if l.matchString("->") {
+		l.addTokenAndAdvance(TokenArrowSolid, "->", 2)
+		return TokenArrowSolid
+	}
+	if l.matchString("-)") {
+		l.addTokenAndAdvance(TokenArrowOpen, "-)", 2)
+		return TokenArrowOpen
+	}
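
The invariant this whole function depends on: for any two overlapping patterns, the longer one must be tried first (or the shorter one guarded, as with `-->` above), otherwise the shorter prefix wins and the leftover characters are mis-lexed. A standalone demonstration of the failure mode:

```go
package main

import (
	"fmt"
	"strings"
)

// The same arrow patterns in safe and unsafe orders.
var (
	longestFirst  = []string{"-->>", "-->", "->>", "->"}
	shortestFirst = []string{"->", "-->", "->>", "-->>"}
)

// match returns the first pattern that prefixes input, mimicking the
// if-chain in tryMultiCharOperator.
func match(patterns []string, input string) string {
	for _, p := range patterns {
		if strings.HasPrefix(input, p) {
			return p
		}
	}
	return ""
}

func main() {
	input := "-->>Bob: hi"
	fmt.Println(match(longestFirst, input))  // "-->>": the whole dotted arrow
	fmt.Println(match(shortestFirst, input)) // "-->": leaves a stray ">"
}
```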
 
 	// Check for arrows - based on destructLink patterns
 	if l.matchString("==>") {
@@ -330,6 +516,12 @@ func (l *Lexer) tryMultiCharOperator() TokenType {
 		return TokenCloseDoubleParen
 	}
 
+	// Check for single-character ER cardinality symbols
+	if l.matchString("o") && !l.isNextChar('{') && !l.isNextChar('-') && !l.isNextCharLetter() {
+		l.addTokenAndAdvance(TokenERCardZeroOrOne, "o", 1)
+		return TokenERCardZeroOrOne
+	}
+
 	return TokenError
 }
 
@@ -376,17 +568,17 @@ func (l *Lexer) consumeComment() error {
 	return nil
 }
 
-// consumeIdentifier consumes identifiers and keywords
+// consumeIdentifier consumes identifiers and keywords, with Unicode support
 func (l *Lexer) consumeIdentifier() error {
 	start := l.position
 
 	// First character already validated
 	l.advance()
 
-	// Continue with alphanumeric and underscore
+	// Continue with Unicode identifier characters
 	for l.position < len(l.input) {
 		ch := l.current()
-		if unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' {
+		if l.isUnicodeIdentifierChar(ch) {
+			// '-' is an identifier character only when a letter or digit
+			// follows (e.g. "stateDiagram-v2"); otherwise it begins an
+			// arrow such as "-->" and must end the identifier.
+			if ch == '-' && !unicode.IsLetter(l.peek()) && !unicode.IsDigit(l.peek()) {
+				break
+			}
 			l.advance()
 		} else {
 			break
@@ -434,6 +626,27 @@ func (l *Lexer) getKeywordType(value string) TokenType {
 		return TokenRL
 	case "lr":
 		return TokenLR
+	// State diagram keywords
+	case "state", "diagram", "statediagram-v2", "statediagram":
+		return TokenID
+	case "entry":
+		return TokenEntry
+	case "exit":
+		return TokenExit
+	case "do":
+		return TokenDo
+	// ER attribute key constraints (primary/foreign/unique key)
+	case "pk", "fk", "uk":
+		return TokenID
 	default:
 		return TokenID
 	}
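
As keyword handling spreads across diagram types, the switch could become a data table, which keeps each diagram's keywords in one visible place. A minimal sketch with a subset of the keywords above (the map-based lookup is an alternative shape, not what this diff does):

```go
package main

import "fmt"

type TokenType int

const (
	TokenID TokenType = iota
	TokenEntry
	TokenExit
	TokenDo
)

// keywordTypes maps lower-cased keywords to their token types; anything
// absent from the map is an ordinary identifier.
var keywordTypes = map[string]TokenType{
	"entry": TokenEntry,
	"exit":  TokenExit,
	"do":    TokenDo,
}

func keywordType(value string) TokenType {
	if t, ok := keywordTypes[value]; ok {
		return t
	}
	return TokenID
}

func main() {
	fmt.Println(keywordType("entry")) // 1 (TokenEntry)
	fmt.Println(keywordType("用户"))  // 0 (TokenID)
}
```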
 
@@ -455,6 +668,30 @@ func (l *Lexer) consumeNumber() error {
 		}
 	}
 
+	// Check for duration suffixes (d, w, m, y, h)
+	if l.position < len(l.input) {
+		ch := l.current()
+		if ch == 'd' || ch == 'w' || ch == 'm' || ch == 'y' || ch == 'h' {
+			l.advance()
+		}
+	}
+
+	// Check for a date (YYYY-MM-DD): consume a dash only when a digit
+	// follows, so an arrow such as "-->" after a number is left intact
+	if l.position < len(l.input) && l.current() == '-' && unicode.IsDigit(l.peek()) {
+		l.advance() // consume the dash
+		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
+			l.advance()
+		}
+		// Look for the second dash and the remaining digits
+		if l.position < len(l.input) && l.current() == '-' && unicode.IsDigit(l.peek()) {
+			l.advance() // consume the second dash
+			for l.position < len(l.input) && unicode.IsDigit(l.current()) {
+				l.advance()
+			}
+		}
+	}
+
 	l.addToken(TokenNumber, l.input[start:l.position])
 	return nil
 }
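
What the extended scanner accepts can be checked in isolation. This standalone sketch mirrors the logic above (digits, optional duration suffix, dash-guarded date tail) on a plain string rather than the lexer's state:

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// scanNumber returns the longest prefix of input that the extended
// consumeNumber would treat as a single number token.
func scanNumber(input string) string {
	i := 0
	digits := func() {
		for i < len(input) && unicode.IsDigit(rune(input[i])) {
			i++
		}
	}
	digit := func(j int) bool {
		return j < len(input) && unicode.IsDigit(rune(input[j]))
	}

	digits()
	if i < len(input) && strings.ContainsRune("dwmyh", rune(input[i])) {
		i++ // duration suffix, e.g. "30d"
	}
	for k := 0; k < 2; k++ { // at most two date dashes
		if i < len(input) && input[i] == '-' && digit(i+1) {
			i++
			digits()
		}
	}
	return input[:i]
}

func main() {
	fmt.Println(scanNumber("2024-01-15, after")) // 2024-01-15
	fmt.Println(scanNumber("30d later"))         // 30d
	fmt.Println(scanNumber("5-->x"))             // 5 (arrow left intact)
}
```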
 
@@ -560,27 +797,34 @@ func (l *Lexer) current() rune {
 	if l.position >= len(l.input) {
 		return 0
 	}
-	return rune(l.input[l.position])
+	r, _ := utf8.DecodeRuneInString(l.input[l.position:])
+	return r
 }
 
 // peek returns the next character without advancing
 func (l *Lexer) peek() rune {
-	if l.position+1 >= len(l.input) {
+	if l.position >= len(l.input) {
+		return 0
+	}
+	_, size := utf8.DecodeRuneInString(l.input[l.position:])
+	if l.position+size >= len(l.input) {
 		return 0
 	}
-	return rune(l.input[l.position+1])
+	nextR, _ := utf8.DecodeRuneInString(l.input[l.position+size:])
+	return nextR
 }
 
 // advance moves to the next character
 func (l *Lexer) advance() {
 	if l.position < len(l.input) {
-		if l.input[l.position] == '\n' {
+		r, size := utf8.DecodeRuneInString(l.input[l.position:])
+		if r == '\n' {
 			l.line++
 			l.column = 1
 		} else {
 			l.column++
 		}
-		l.position++
+		l.position += size
 	}
 }
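
This switch from byte indexing to `utf8.DecodeRuneInString` is the core of the CJK support: `l.position` stays a byte offset, but every read and every advance now moves one rune at a time. A standalone before/after illustration:

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	input := "状态A"

	// Byte indexing misreads multibyte UTF-8: each Chinese character here
	// occupies three bytes, so input[0] is only the first byte of '状'.
	fmt.Println(len(input), rune(input[0])) // 7 231

	// Decoding rune by rune, as the patched current/advance do, walks the
	// string in character-sized steps.
	for i := 0; i < len(input); {
		r, size := utf8.DecodeRuneInString(input[i:])
		fmt.Printf("byte %d: %c (%d bytes)\n", i, r, size)
		i += size
	}
}
```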
 
@@ -592,26 +836,80 @@ func (l *Lexer) matchString(s string) bool {
 	return l.input[l.position:l.position+len(s)] == s
 }
 
+// isNextChar checks whether the byte after the current one matches ch
+// (all call sites pass ASCII delimiters, so a byte comparison is safe)
+func (l *Lexer) isNextChar(ch byte) bool {
+	if l.position+1 >= len(l.input) {
+		return false
+	}
+	return l.input[l.position+1] == ch
+}
+
+// isNextCharLetter checks whether the character after the current one is a
+// letter; the rune is decoded so multibyte UTF-8 is classified correctly
+// (callers sit on a single-byte character such as 'o')
+func (l *Lexer) isNextCharLetter() bool {
+	if l.position+1 >= len(l.input) {
+		return false
+	}
+	r, _ := utf8.DecodeRuneInString(l.input[l.position+1:])
+	return unicode.IsLetter(r)
+}
+
 // addToken adds a token to the token list
 func (l *Lexer) addToken(tokenType TokenType, value string) {
+	runeCount := utf8.RuneCountInString(value)
+	byteCount := len(value)
 	token := Token{
 		Type:     tokenType,
 		Value:    value,
 		Line:     l.line,
-		Column:   l.column - len(value),
-		Position: l.position - len(value),
+		Column:   l.column - runeCount,
+		Position: l.position - byteCount,
 	}
 	l.tokens = append(l.tokens, token)
 }
 
-// addTokenAndAdvance adds a token and advances position
-func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, length int) {
+// addTokenAndAdvance adds a token and advances by the given number of runes
+func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, runeCount int) {
 	l.addToken(tokenType, value)
-	for i := 0; i < length; i++ {
+	for i := 0; i < runeCount; i++ {
 		l.advance()
 	}
 }
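
Note the deliberate asymmetry in `addToken`: `Column` is counted in runes (what a user sees on the line) while `Position` stays in bytes (an index into the input string). A quick look at why the two counts diverge for CJK text:

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	value := "用户A" // an identifier token containing two CJK runes

	// Column arithmetic needs the rune count; byte offsets need len().
	fmt.Println(utf8.RuneCountInString(value)) // 3
	fmt.Println(len(value))                    // 7

	// Using len(value) for the column would place this token four columns
	// too far to the left whenever it contains CJK characters.
}
```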
+
+// isChineseChar reports whether a rune falls in the CJK ranges accepted in
+// identifiers; despite the name, this also covers Japanese kana, Hangul,
+// Bopomofo, and CJK punctuation/fullwidth forms
+func (l *Lexer) isChineseChar(ch rune) bool {
+	return (ch >= 0x4e00 && ch <= 0x9fff) || // CJK Unified Ideographs
+		(ch >= 0x3400 && ch <= 0x4dbf) || // CJK Extension A
+		(ch >= 0x20000 && ch <= 0x2a6df) || // CJK Extension B
+		(ch >= 0x2a700 && ch <= 0x2b73f) || // CJK Extension C
+		(ch >= 0x2b740 && ch <= 0x2b81f) || // CJK Extension D
+		(ch >= 0x2b820 && ch <= 0x2ceaf) || // CJK Extension E
+		(ch >= 0x2ceb0 && ch <= 0x2ebef) || // CJK Extension F
+		(ch >= 0x30000 && ch <= 0x3134f) || // CJK Extension G
+		(ch >= 0x3190 && ch <= 0x319f) || // Kanbun
+		(ch >= 0x31c0 && ch <= 0x31ef) || // CJK Strokes
+		(ch >= 0x2e80 && ch <= 0x2eff) || // CJK Radicals Supplement
+		(ch >= 0x2f00 && ch <= 0x2fdf) || // Kangxi Radicals
+		(ch >= 0x2ff0 && ch <= 0x2fff) || // Ideographic Description Characters
+		(ch >= 0x3000 && ch <= 0x303f) || // CJK Symbols and Punctuation
+		(ch >= 0x3040 && ch <= 0x309f) || // Hiragana
+		(ch >= 0x30a0 && ch <= 0x30ff) || // Katakana
+		(ch >= 0x3100 && ch <= 0x312f) || // Bopomofo
+		(ch >= 0x3130 && ch <= 0x318f) || // Hangul Compatibility Jamo
+		(ch >= 0x31a0 && ch <= 0x31bf) || // Bopomofo Extended
+		(ch >= 0xac00 && ch <= 0xd7af) || // Hangul Syllables
+		(ch >= 0xff00 && ch <= 0xffef) // Halfwidth and Fullwidth Forms
+}
+
+// isUnicodeIdentifierStart checks if a character can start a Unicode identifier
+func (l *Lexer) isUnicodeIdentifierStart(ch rune) bool {
+	return unicode.IsLetter(ch) || ch == '_' || l.isChineseChar(ch)
+}
+
+// isUnicodeIdentifierChar checks if a character can be part of a Unicode identifier
+func (l *Lexer) isUnicodeIdentifierChar(ch rune) bool {
+	return unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' || ch == '-' || l.isChineseChar(ch)
+}
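
The two predicates can be exercised without a lexer instance. A standalone sketch, where `cjk` is a deliberately trimmed stand-in for `isChineseChar` covering only the main ideograph block:

```go
package main

import (
	"fmt"
	"unicode"
)

// cjk is a trimmed stand-in for isChineseChar (BMP ideographs only).
func cjk(ch rune) bool { return ch >= 0x4e00 && ch <= 0x9fff }

// identStart and identChar mirror the two predicates above, minus the receiver.
func identStart(ch rune) bool { return unicode.IsLetter(ch) || ch == '_' || cjk(ch) }
func identChar(ch rune) bool {
	return unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' || ch == '-' || cjk(ch)
}

func main() {
	for _, ch := range []rune{'A', '_', '用', '1', '-'} {
		fmt.Printf("%q  start=%v  part=%v\n", ch, identStart(ch), identChar(ch))
	}
	// '1' and '-' may continue an identifier but cannot start one.
}
```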
+
 // FilterTokens removes whitespace and comment tokens for parsing
 func FilterTokens(tokens []Token) []Token {
 	filtered := make([]Token, 0, len(tokens))