lexer.go

// Package lexer provides lexical analysis for Mermaid diagram syntax.
// Based on the lexical rules from flow.jison in mermaid.js.
package lexer

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)
// TokenType represents the type of a lexical token.
type TokenType int

const (
	// Special tokens
	TokenEOF TokenType = iota
	TokenNewline
	TokenSpace
	TokenComment

	// Keywords - from flow.jison
	TokenGraph
	TokenSubgraph
	TokenEnd
	TokenDirection
	TokenClass
	TokenClassDef
	TokenClick
	TokenStyle
	TokenLinkStyle
	TokenDefault

	// Directions
	TokenTD // Top Down
	TokenTB // Top Bottom
	TokenBT // Bottom Top
	TokenRL // Right Left
	TokenLR // Left Right

	// State diagram action keywords
	TokenEntry // entry
	TokenExit  // exit
	TokenDo    // do

	// Identifiers and literals
	TokenID
	TokenString
	TokenNodeString
	TokenNumber
	TokenUnicodeText

	// Shape delimiters - following JISON patterns
	TokenOpenBracket      // [
	TokenCloseBracket     // ]
	TokenOpenParen        // (
	TokenCloseParen       // )
	TokenOpenBrace        // {
	TokenCloseBrace       // }
	TokenOpenDoubleParen  // ((
	TokenCloseDoubleParen // ))
	TokenOpenAngle        // <
	TokenCloseAngle       // >

	// Edge tokens - from destructLink logic in flowDb.ts
	TokenArrowSolid  // -->
	TokenArrowDotted // -.->
	TokenArrowThick  // ==>
	TokenArrowOpen   // ---
	TokenArrowPoint  // --> (same pattern as TokenArrowSolid; not currently emitted)
	TokenArrowCross  // --x
	TokenArrowCircle // --o

	// ER diagram relationship tokens
	TokenEROneToMany     // ||--o{
	TokenEROneToManyAlt  // ||--|{
	TokenERManyToOne     // }o--||
	TokenEROneToOne      // ||--||
	TokenERManyToMany    // }o--o{
	TokenERManyToManyAlt // }|..|{
	TokenERZeroToOne     // ||--o|

	// Class diagram relationship tokens
	TokenClassInheritance // <|--
	TokenClassComposition // *--
	TokenClassAggregation // o--
	TokenClassAssociation // -->
	TokenClassRealization // ..|>
	TokenClassDependency  // ..>

	// ER diagram cardinality tokens
	TokenERCardOnlyOne    // ||
	TokenERCardZeroOrOne  // o
	TokenERCardZeroOrMore // o{
	TokenERCardOneOrMore  // |{
	TokenERCardCloseOne   // }|
	TokenERCardCloseZero  // }o

	// Edge modifiers and punctuation
	TokenPipe        // |
	TokenColon       // :
	TokenSemicolon   // ;
	TokenComma       // ,
	TokenAmpersand   // &
	TokenMult        // *
	TokenPlus        // +
	TokenMinus       // -
	TokenEquals      // =
	TokenDot         // .
	TokenExclamation // !
	TokenSlash       // /
	TokenBackslash   // \
	TokenHash        // #
	TokenAt          // @
	TokenPercent     // %
	TokenTilde       // ~
	TokenQuestion    // ?

	// Error token
	TokenError
)
// Token represents a lexical token.
type Token struct {
	Type     TokenType
	Value    string
	Line     int
	Column   int
	Position int
}

// String returns a string representation of the token.
func (t Token) String() string {
	return fmt.Sprintf("Token{Type: %s, Value: %q, Line: %d, Col: %d}",
		t.Type.String(), t.Value, t.Line, t.Column)
}
// String returns the string representation of a TokenType.
func (tt TokenType) String() string {
	if name, ok := tokenTypeNames[tt]; ok {
		return name
	}
	return fmt.Sprintf("TokenType(%d)", int(tt))
}
var tokenTypeNames = map[TokenType]string{
	TokenEOF:              "EOF",
	TokenNewline:          "NEWLINE",
	TokenSpace:            "SPACE",
	TokenComment:          "COMMENT",
	TokenGraph:            "GRAPH",
	TokenSubgraph:         "SUBGRAPH",
	TokenEnd:              "END",
	TokenDirection:        "DIRECTION",
	TokenClass:            "CLASS",
	TokenClassDef:         "CLASSDEF",
	TokenClick:            "CLICK",
	TokenStyle:            "STYLE",
	TokenLinkStyle:        "LINKSTYLE",
	TokenDefault:          "DEFAULT",
	TokenTD:               "TD",
	TokenTB:               "TB",
	TokenBT:               "BT",
	TokenRL:               "RL",
	TokenLR:               "LR",
	TokenEntry:            "ENTRY",
	TokenExit:             "EXIT",
	TokenDo:               "DO",
	TokenID:               "ID",
	TokenString:           "STRING",
	TokenNodeString:       "NODE_STRING",
	TokenNumber:           "NUMBER",
	TokenUnicodeText:      "UNICODE_TEXT",
	TokenOpenBracket:      "OPEN_BRACKET",
	TokenCloseBracket:     "CLOSE_BRACKET",
	TokenOpenParen:        "OPEN_PAREN",
	TokenCloseParen:       "CLOSE_PAREN",
	TokenOpenBrace:        "OPEN_BRACE",
	TokenCloseBrace:       "CLOSE_BRACE",
	TokenOpenDoubleParen:  "OPEN_DOUBLE_PAREN",
	TokenCloseDoubleParen: "CLOSE_DOUBLE_PAREN",
	TokenOpenAngle:        "OPEN_ANGLE",
	TokenCloseAngle:       "CLOSE_ANGLE",
	TokenArrowSolid:       "ARROW_SOLID",
	TokenArrowDotted:      "ARROW_DOTTED",
	TokenArrowThick:       "ARROW_THICK",
	TokenArrowOpen:        "ARROW_OPEN",
	TokenArrowPoint:       "ARROW_POINT",
	TokenArrowCross:       "ARROW_CROSS",
	TokenArrowCircle:      "ARROW_CIRCLE",
	TokenEROneToMany:      "ER_ONE_TO_MANY",
	TokenEROneToManyAlt:   "ER_ONE_TO_MANY_ALT",
	TokenERManyToOne:      "ER_MANY_TO_ONE",
	TokenEROneToOne:       "ER_ONE_TO_ONE",
	TokenERManyToMany:     "ER_MANY_TO_MANY",
	TokenERManyToManyAlt:  "ER_MANY_TO_MANY_ALT",
	TokenERZeroToOne:      "ER_ZERO_TO_ONE",
	TokenClassInheritance: "CLASS_INHERITANCE",
	TokenClassComposition: "CLASS_COMPOSITION",
	TokenClassAggregation: "CLASS_AGGREGATION",
	TokenClassAssociation: "CLASS_ASSOCIATION",
	TokenClassRealization: "CLASS_REALIZATION",
	TokenClassDependency:  "CLASS_DEPENDENCY",
	TokenERCardOnlyOne:    "ER_CARD_ONLY_ONE",
	TokenERCardZeroOrOne:  "ER_CARD_ZERO_OR_ONE",
	TokenERCardZeroOrMore: "ER_CARD_ZERO_OR_MORE",
	TokenERCardOneOrMore:  "ER_CARD_ONE_OR_MORE",
	TokenERCardCloseOne:   "ER_CARD_CLOSE_ONE",
	TokenERCardCloseZero:  "ER_CARD_CLOSE_ZERO",
	TokenPipe:             "PIPE",
	TokenColon:            "COLON",
	TokenSemicolon:        "SEMICOLON",
	TokenComma:            "COMMA",
	TokenAmpersand:        "AMPERSAND",
	TokenMult:             "MULT",
	TokenPlus:             "PLUS",
	TokenMinus:            "MINUS",
	TokenEquals:           "EQUALS",
	TokenDot:              "DOT",
	TokenExclamation:      "EXCLAMATION",
	TokenSlash:            "SLASH",
	TokenBackslash:        "BACKSLASH",
	TokenHash:             "HASH",
	TokenAt:               "AT",
	TokenPercent:          "PERCENT",
	TokenTilde:            "TILDE",
	TokenQuestion:         "QUESTION",
	TokenError:            "ERROR",
}
// Lexer performs lexical analysis on Mermaid input.
type Lexer struct {
	input    string
	position int // byte offset into input
	line     int // 1-based line number
	column   int // 1-based column, counted in runes
	tokens   []Token
}

// NewLexer creates a new lexer for the given input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input:  input,
		line:   1,
		column: 1,
		tokens: make([]Token, 0),
	}
}
// Tokenize performs lexical analysis and returns all tokens, ending with an
// EOF token.
func (l *Lexer) Tokenize() ([]Token, error) {
	for l.position < len(l.input) {
		if err := l.nextToken(); err != nil {
			return nil, err
		}
	}
	l.addToken(TokenEOF, "")
	return l.tokens, nil
}
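
// Usage sketch (illustrative only, not part of the original file): lex a
// small flowchart, then drop whitespace and comment tokens before parsing.
//
//	lex := NewLexer("graph TD\n\tA-->B")
//	tokens, err := lex.Tokenize()
//	if err != nil {
//		// handle the lexing error
//	}
//	for _, tok := range FilterTokens(tokens) {
//		fmt.Println(tok) // e.g. Token{Type: GRAPH, Value: "graph", ...}
//	}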
// nextToken processes the next token from the input.
func (l *Lexer) nextToken() error {
	if l.position >= len(l.input) {
		return nil
	}
	ch := l.current()
	// Whitespace; newlines are emitted as significant tokens.
	if unicode.IsSpace(ch) {
		return l.consumeWhitespace()
	}
	// Comments - following the mermaid.js %% convention.
	if ch == '%' && l.peek() == '%' {
		return l.consumeComment()
	}
	// Multi-character operators first (order matters!). A successful match
	// has already consumed the operator and recorded its token.
	if l.tryMultiCharOperator() != TokenError {
		return nil
	}
	// Keywords and identifiers - supports Unicode letters, including Chinese.
	if l.isUnicodeIdentifierStart(ch) {
		return l.consumeIdentifier()
	}
	// Numbers.
	if unicode.IsDigit(ch) {
		return l.consumeNumber()
	}
	// Strings.
	if ch == '"' {
		return l.consumeString()
	}
	// Single-character tokens.
	return l.consumeSingleChar()
}
// multiCharPatterns pairs each multi-character operator with the token it
// produces. Matching is first-match-wins, so longer patterns must precede
// their prefixes.
var multiCharPatterns = []struct {
	lit string
	tok TokenType
}{
	// State diagram special markers
	{"<<fork>>", TokenID},
	{"<<join>>", TokenID},
	{"<<choice>>", TokenID},
	{"<<history>>", TokenID},
	{"<<deepHistory>>", TokenID},
	// ER diagram relationships (must come before the shorter cardinality patterns)
	{"||--o{", TokenEROneToMany},
	{"}o--||", TokenERManyToOne},
	{"||--||", TokenEROneToOne},
	{"}o--o{", TokenERManyToMany},
	{"||--o|", TokenERZeroToOne},
	{"||--|{", TokenEROneToManyAlt},
	{"}|..|{", TokenERManyToManyAlt},
	// Class diagram relationships; --> is reserved for the flowchart
	// TokenArrowSolid below
	{"<|--", TokenClassInheritance},
	{"*--", TokenClassComposition},
	{"o--", TokenClassAggregation},
	{"..|>", TokenClassRealization},
	{"..>", TokenClassDependency},
	// ER diagram cardinality symbols
	{"||", TokenERCardOnlyOne},
	{"o{", TokenERCardZeroOrMore},
	{"|{", TokenERCardOneOrMore},
	{"}|", TokenERCardCloseOne},
	{"}o", TokenERCardCloseZero},
	// Diagram type declarations
	{"stateDiagram-v2", TokenID},
	{"stateDiagram", TokenID},
	{"sequenceDiagram", TokenID},
	{"erDiagram", TokenID},
	// Sequence diagram arrows (longer patterns first, following mermaid.js):
	// bidirectional arrows
	{"<<-->>", TokenArrowDotted},
	{"<<->>", TokenArrowSolid},
	// complex arrows with directional markers
	{"--|\\", TokenArrowSolid},
	{"--|/", TokenArrowSolid},
	{"-|\\", TokenArrowSolid},
	{"-|/", TokenArrowSolid},
	// standard arrows
	{"-->>", TokenArrowDotted},
	{"->>", TokenArrowSolid},
	{"<->", TokenArrowSolid},
	{"-x", TokenArrowCross},
	{"--)", TokenArrowOpen},
	{"->", TokenArrowSolid},
	{"-)", TokenArrowOpen},
	// Flowchart arrows - based on destructLink patterns
	{"==>", TokenArrowThick},
	{"-->", TokenArrowSolid},
	{"-.->", TokenArrowDotted},
	{"--x", TokenArrowCross},
	{"--o", TokenArrowCircle},
	{"---", TokenArrowOpen},
	// Double parentheses for circle nodes
	{"((", TokenOpenDoubleParen},
	{"))", TokenCloseDoubleParen},
}

// tryMultiCharOperator attempts to match a multi-character operator at the
// current position. On success it records the token, consumes its text, and
// returns its type; otherwise it returns TokenError and consumes nothing.
func (l *Lexer) tryMultiCharOperator() TokenType {
	for _, p := range multiCharPatterns {
		if l.matchString(p.lit) {
			// All patterns are ASCII, so rune count equals byte count.
			l.addTokenAndAdvance(p.tok, p.lit, len(p.lit))
			return p.tok
		}
	}
	// Single-character ER cardinality: a lone "o" that does not start an
	// identifier, an "o{" cardinality, or an "o--" relationship.
	if l.matchString("o") && !l.isNextChar('{') && !l.isNextChar('-') && !l.isNextCharLetter() {
		l.addTokenAndAdvance(TokenERCardZeroOrOne, "o", 1)
		return TokenERCardZeroOrOne
	}
	return TokenError
}
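
// Matching is strictly first-match-wins over the ordered table above, which
// is why longer operators precede their prefixes. For example, on the input
// "-->>" the table tests "-->>" before "-->" and "->"; testing in the
// opposite order would emit a shorter arrow followed by stray ">" tokens.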
// consumeWhitespace consumes a run of whitespace. A newline terminates the
// run and is emitted as its own TokenNewline so the parser can treat it as
// significant.
func (l *Lexer) consumeWhitespace() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsSpace(l.current()) {
		if l.current() == '\n' {
			// Emit any whitespace collected before the newline first.
			if start < l.position {
				l.addToken(TokenSpace, l.input[start:l.position])
			}
			l.advance() // consume '\n', updating line and column counters
			l.addToken(TokenNewline, "\n")
			return nil
		}
		l.advance()
	}
	if start < l.position {
		l.addToken(TokenSpace, l.input[start:l.position])
	}
	return nil
}
// consumeComment consumes a %% comment through the end of the line.
func (l *Lexer) consumeComment() error {
	start := l.position
	// Skip the leading %%.
	l.advance()
	l.advance()
	// Read until end of line.
	for l.position < len(l.input) && l.current() != '\n' {
		l.advance()
	}
	l.addToken(TokenComment, l.input[start:l.position])
	return nil
}
// consumeIdentifier consumes an identifier or keyword, with Unicode support.
func (l *Lexer) consumeIdentifier() error {
	start := l.position
	// The first character was already validated by the caller.
	l.advance()
	// Continue with Unicode identifier characters.
	for l.position < len(l.input) {
		ch := l.current()
		if !l.isUnicodeIdentifierChar(ch) {
			break
		}
		// '-' is allowed inside identifiers (e.g. "a-b"), but must not
		// swallow the start of a flowchart arrow such as "-->", "-.->"
		// or "->" that immediately follows a node name.
		if ch == '-' {
			if next := l.peek(); next == '-' || next == '.' || next == '>' {
				break
			}
		}
		l.advance()
	}
	value := l.input[start:l.position]
	l.addToken(l.getKeywordType(value), value)
	return nil
}
// getKeywordType returns the token type for keywords, or TokenID for plain
// identifiers. Keywords follow flow.jison.
func (l *Lexer) getKeywordType(value string) TokenType {
	switch strings.ToLower(value) {
	case "graph", "flowchart": // flowchart uses the same token as graph
		return TokenGraph
	case "subgraph":
		return TokenSubgraph
	case "end":
		return TokenEnd
	case "class":
		return TokenClass
	case "classdef":
		return TokenClassDef
	case "click":
		return TokenClick
	case "style":
		return TokenStyle
	case "linkstyle":
		return TokenLinkStyle
	case "default":
		return TokenDefault
	// Direction keywords
	case "td":
		return TokenTD
	case "tb":
		return TokenTB
	case "bt":
		return TokenBT
	case "rl":
		return TokenRL
	case "lr":
		return TokenLR
	// State diagram keywords
	case "entry":
		return TokenEntry
	case "exit":
		return TokenExit
	case "do":
		return TokenDo
	default:
		// "state", "diagram", "statediagram", "pk", "fk", "uk" and all
		// other identifiers fall through to TokenID.
		return TokenID
	}
}
// consumeNumber consumes numeric literals, including decimals, duration
// suffixes (d, w, m, y, h) and YYYY-MM-DD dates.
func (l *Lexer) consumeNumber() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsDigit(l.current()) {
		l.advance()
	}
	// Decimal point.
	if l.position < len(l.input) && l.current() == '.' {
		l.advance()
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
	}
	// Duration suffixes (d, w, m, y, h).
	if l.position < len(l.input) {
		switch l.current() {
		case 'd', 'w', 'm', 'y', 'h':
			l.advance()
		}
	}
	// Date format (YYYY-MM-DD): a dash continues the number only when a
	// digit follows, so arrows such as "1-->2" are not swallowed.
	if l.position < len(l.input) && l.current() == '-' && unicode.IsDigit(l.peek()) {
		l.advance() // consume the first dash
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
		if l.position < len(l.input) && l.current() == '-' && unicode.IsDigit(l.peek()) {
			l.advance() // consume the second dash
			for l.position < len(l.input) && unicode.IsDigit(l.current()) {
				l.advance()
			}
		}
	}
	l.addToken(TokenNumber, l.input[start:l.position])
	return nil
}
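
// For reference, consumeNumber above accepts shapes like "42", "3.14",
// "2w" (duration) and "2024-01-15" (date). In "1-->2" the dash is not
// followed by a digit, so the number ends at "1" and the arrow lexes intact.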
// consumeString consumes a double-quoted string literal, including both
// quotes; backslash escapes are skipped over rather than interpreted.
func (l *Lexer) consumeString() error {
	start := l.position
	// Skip the opening quote.
	l.advance()
	for l.position < len(l.input) && l.current() != '"' {
		if l.current() == '\\' && l.position+1 < len(l.input) {
			// Skip the backslash and the escaped character.
			l.advance()
			l.advance()
		} else {
			l.advance()
		}
	}
	if l.position >= len(l.input) {
		return fmt.Errorf("unterminated string at line %d, column %d", l.line, l.column)
	}
	// Skip the closing quote.
	l.advance()
	l.addToken(TokenString, l.input[start:l.position])
	return nil
}
// consumeSingleChar handles single-character tokens.
func (l *Lexer) consumeSingleChar() error {
	ch := l.current()
	var tokenType TokenType
	switch ch {
	case '[':
		tokenType = TokenOpenBracket
	case ']':
		tokenType = TokenCloseBracket
	case '(':
		tokenType = TokenOpenParen
	case ')':
		tokenType = TokenCloseParen
	case '{':
		tokenType = TokenOpenBrace
	case '}':
		tokenType = TokenCloseBrace
	case '<':
		tokenType = TokenOpenAngle
	case '>':
		tokenType = TokenCloseAngle
	case '|':
		tokenType = TokenPipe
	case ':':
		tokenType = TokenColon
	case ';':
		tokenType = TokenSemicolon
	case ',':
		tokenType = TokenComma
	case '&':
		tokenType = TokenAmpersand
	case '*':
		tokenType = TokenMult
	case '+':
		tokenType = TokenPlus
	case '-':
		tokenType = TokenMinus
	case '=':
		tokenType = TokenEquals
	case '.':
		tokenType = TokenDot
	case '!':
		tokenType = TokenExclamation
	case '/':
		tokenType = TokenSlash
	case '\\':
		tokenType = TokenBackslash
	case '#':
		tokenType = TokenHash
	case '@':
		tokenType = TokenAt
	case '%':
		tokenType = TokenPercent
	case '~':
		tokenType = TokenTilde
	case '?':
		tokenType = TokenQuestion
	default:
		return fmt.Errorf("unexpected character %q at line %d, column %d", ch, l.line, l.column)
	}
	l.addTokenAndAdvance(tokenType, string(ch), 1)
	return nil
}
// Helper methods

// current returns the rune at the current position, or 0 at end of input.
func (l *Lexer) current() rune {
	if l.position >= len(l.input) {
		return 0
	}
	r, _ := utf8.DecodeRuneInString(l.input[l.position:])
	return r
}

// peek returns the rune after the current one without advancing, or 0 if
// there is none.
func (l *Lexer) peek() rune {
	if l.position >= len(l.input) {
		return 0
	}
	_, size := utf8.DecodeRuneInString(l.input[l.position:])
	if l.position+size >= len(l.input) {
		return 0
	}
	next, _ := utf8.DecodeRuneInString(l.input[l.position+size:])
	return next
}

// advance moves past the current rune, updating line and column counters.
func (l *Lexer) advance() {
	if l.position < len(l.input) {
		r, size := utf8.DecodeRuneInString(l.input[l.position:])
		if r == '\n' {
			l.line++
			l.column = 1
		} else {
			l.column++
		}
		l.position += size
	}
}
// matchString reports whether the input at the current position begins with s.
func (l *Lexer) matchString(s string) bool {
	if l.position+len(s) > len(l.input) {
		return false
	}
	return l.input[l.position:l.position+len(s)] == s
}

// isNextChar reports whether the byte after the current (single-byte)
// character equals ch.
func (l *Lexer) isNextChar(ch byte) bool {
	if l.position+1 >= len(l.input) {
		return false
	}
	return l.input[l.position+1] == ch
}

// isNextCharLetter reports whether the character after the current
// (single-byte) character is a letter. The rune is decoded as UTF-8 so that
// multi-byte letters are recognized correctly.
func (l *Lexer) isNextCharLetter() bool {
	if l.position+1 >= len(l.input) {
		return false
	}
	r, _ := utf8.DecodeRuneInString(l.input[l.position+1:])
	return unicode.IsLetter(r)
}
// addToken appends a token whose text has just been consumed. Line, column,
// and position are back-computed from the current location, so callers must
// invoke it after advancing past the token's text. The column arithmetic
// assumes the token does not span a line break.
func (l *Lexer) addToken(tokenType TokenType, value string) {
	runeCount := utf8.RuneCountInString(value)
	byteCount := len(value)
	l.tokens = append(l.tokens, Token{
		Type:     tokenType,
		Value:    value,
		Line:     l.line,
		Column:   l.column - runeCount,
		Position: l.position - byteCount,
	})
}

// addTokenAndAdvance advances past the given number of runes and then records
// the token. Advancing first keeps addToken's back-computed column and
// position pointing at the token's start.
func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, runeCount int) {
	for i := 0; i < runeCount; i++ {
		l.advance()
	}
	l.addToken(tokenType, value)
}
// isChineseChar reports whether a rune falls in the CJK and related East
// Asian Unicode ranges (the list also covers kana, Hangul, and full-width
// forms).
func (l *Lexer) isChineseChar(ch rune) bool {
	return (ch >= 0x4e00 && ch <= 0x9fff) || // CJK Unified Ideographs
		(ch >= 0x3400 && ch <= 0x4dbf) || // CJK Extension A
		(ch >= 0x20000 && ch <= 0x2a6df) || // CJK Extension B
		(ch >= 0x2a700 && ch <= 0x2b73f) || // CJK Extension C
		(ch >= 0x2b740 && ch <= 0x2b81f) || // CJK Extension D
		(ch >= 0x2b820 && ch <= 0x2ceaf) || // CJK Extension E
		(ch >= 0x2ceb0 && ch <= 0x2ebef) || // CJK Extension F
		(ch >= 0x30000 && ch <= 0x3134f) || // CJK Extension G
		(ch >= 0x3190 && ch <= 0x319f) || // Kanbun
		(ch >= 0x31c0 && ch <= 0x31ef) || // CJK Strokes
		(ch >= 0x2e80 && ch <= 0x2eff) || // CJK Radicals Supplement
		(ch >= 0x2f00 && ch <= 0x2fdf) || // Kangxi Radicals
		(ch >= 0x2ff0 && ch <= 0x2fff) || // Ideographic Description Characters
		(ch >= 0x3000 && ch <= 0x303f) || // CJK Symbols and Punctuation
		(ch >= 0x3040 && ch <= 0x309f) || // Hiragana
		(ch >= 0x30a0 && ch <= 0x30ff) || // Katakana
		(ch >= 0x3100 && ch <= 0x312f) || // Bopomofo
		(ch >= 0x3130 && ch <= 0x318f) || // Hangul Compatibility Jamo
		(ch >= 0x31a0 && ch <= 0x31bf) || // Bopomofo Extended
		(ch >= 0xac00 && ch <= 0xd7af) || // Hangul Syllables
		(ch >= 0xff00 && ch <= 0xffef) // Halfwidth and Fullwidth Forms
}
// isUnicodeIdentifierStart reports whether ch can start an identifier.
func (l *Lexer) isUnicodeIdentifierStart(ch rune) bool {
	return unicode.IsLetter(ch) || ch == '_' || l.isChineseChar(ch)
}

// isUnicodeIdentifierChar reports whether ch can appear inside an identifier.
func (l *Lexer) isUnicodeIdentifierChar(ch rune) bool {
	return unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' || ch == '-' || l.isChineseChar(ch)
}
// FilterTokens removes whitespace and comment tokens before parsing; newline
// tokens are kept because they are significant in Mermaid syntax.
func FilterTokens(tokens []Token) []Token {
	filtered := make([]Token, 0, len(tokens))
	for _, token := range tokens {
		if token.Type != TokenSpace && token.Type != TokenComment {
			filtered = append(filtered, token)
		}
	}
	return filtered
}
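
// Sanity sketch (assumed to live in a separate test file such as
// lexer_test.go): CJK node names should lex as single ID tokens.
//
//	func TestUnicodeIdentifiers(t *testing.T) {
//		tokens, err := NewLexer("graph TD\n开始-->结束").Tokenize()
//		if err != nil {
//			t.Fatal(err)
//		}
//		// Expect single ID tokens for 开始 and 结束 around ARROW_SOLID.
//	}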