lexer.go

// Package lexer provides lexical analysis for Mermaid diagram syntax.
// Based on the lexical rules from flow.jison in mermaid.js.
package lexer

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)
// TokenType represents the type of a lexical token.
type TokenType int

const (
	// Special tokens
	TokenEOF TokenType = iota
	TokenNewline
	TokenSpace
	TokenComment

	// Keywords - from flow.jison
	TokenGraph
	TokenSubgraph
	TokenEnd
	TokenDirection
	TokenClass
	TokenClassDef
	TokenClick
	TokenStyle
	TokenLinkStyle
	TokenDefault

	// Directions
	TokenTD // Top Down
	TokenTB // Top Bottom
	TokenBT // Bottom Top
	TokenRL // Right Left
	TokenLR // Left Right

	// State diagram action keywords
	TokenEntry // entry
	TokenExit  // exit
	TokenDo    // do

	// Identifiers and literals
	TokenID
	TokenString
	TokenNodeString
	TokenNumber
	TokenUnicodeText

	// Shape delimiters - following JISON patterns
	TokenOpenBracket      // [
	TokenCloseBracket     // ]
	TokenOpenParen        // (
	TokenCloseParen       // )
	TokenOpenBrace        // {
	TokenCloseBrace       // }
	TokenOpenDoubleParen  // ((
	TokenCloseDoubleParen // ))
	TokenOpenAngle        // <
	TokenCloseAngle       // >

	// Edge tokens - from destructLink logic in flowDb.ts
	TokenArrowSolid  // -->
	TokenArrowDotted // -.->
	TokenArrowThick  // ==>
	TokenArrowOpen   // ---
	TokenArrowPoint  // -->
	TokenArrowCross  // --x
	TokenArrowCircle // --o

	// ER diagram relationship tokens
	TokenEROneToMany     // ||--o{
	TokenEROneToManyAlt  // ||--|{
	TokenERManyToOne     // }o--||
	TokenEROneToOne      // ||--||
	TokenERManyToMany    // }o--o{
	TokenERManyToManyAlt // }|..|{
	TokenERZeroToOne     // ||--o|

	// Class diagram relationship tokens
	TokenClassInheritance // <|--
	TokenClassComposition // *--
	TokenClassAggregation // o--
	TokenClassAssociation // -->
	TokenClassRealization // ..|>
	TokenClassDependency  // ..>

	// ER diagram cardinality tokens
	TokenERCardOnlyOne    // ||
	TokenERCardZeroOrOne  // o
	TokenERCardZeroOrMore // o{
	TokenERCardOneOrMore  // |{
	TokenERCardCloseOne   // }|
	TokenERCardCloseZero  // }o

	// Edge modifiers and punctuation
	TokenPipe        // |
	TokenColon       // :
	TokenSemicolon   // ;
	TokenComma       // ,
	TokenAmpersand   // &
	TokenMult        // *
	TokenPlus        // +
	TokenMinus       // -
	TokenEquals      // =
	TokenDot         // .
	TokenExclamation // !
	TokenSlash       // /
	TokenBackslash   // \
	TokenHash        // #
	TokenAt          // @
	TokenPercent     // %
	TokenTilde       // ~
	TokenQuestion    // ?

	// Error token
	TokenError
)
// Token represents a lexical token.
type Token struct {
	Type     TokenType
	Value    string
	Line     int
	Column   int
	Position int
}

// String returns a string representation of the token.
func (t Token) String() string {
	return fmt.Sprintf("Token{Type: %s, Value: %q, Line: %d, Col: %d}",
		t.Type.String(), t.Value, t.Line, t.Column)
}

// String returns the string representation of a TokenType.
func (tt TokenType) String() string {
	if name, exists := tokenTypeNames[tt]; exists {
		return name
	}
	return fmt.Sprintf("TokenType(%d)", int(tt))
}
var tokenTypeNames = map[TokenType]string{
	TokenEOF:              "EOF",
	TokenNewline:          "NEWLINE",
	TokenSpace:            "SPACE",
	TokenComment:          "COMMENT",
	TokenGraph:            "GRAPH",
	TokenSubgraph:         "SUBGRAPH",
	TokenEnd:              "END",
	TokenDirection:        "DIRECTION",
	TokenClass:            "CLASS",
	TokenClassDef:         "CLASSDEF",
	TokenClick:            "CLICK",
	TokenStyle:            "STYLE",
	TokenLinkStyle:        "LINKSTYLE",
	TokenDefault:          "DEFAULT",
	TokenTD:               "TD",
	TokenTB:               "TB",
	TokenBT:               "BT",
	TokenRL:               "RL",
	TokenLR:               "LR",
	TokenEntry:            "ENTRY",
	TokenExit:             "EXIT",
	TokenDo:               "DO",
	TokenID:               "ID",
	TokenString:           "STRING",
	TokenNodeString:       "NODE_STRING",
	TokenNumber:           "NUMBER",
	TokenUnicodeText:      "UNICODE_TEXT",
	TokenOpenBracket:      "OPEN_BRACKET",
	TokenCloseBracket:     "CLOSE_BRACKET",
	TokenOpenParen:        "OPEN_PAREN",
	TokenCloseParen:       "CLOSE_PAREN",
	TokenOpenBrace:        "OPEN_BRACE",
	TokenCloseBrace:       "CLOSE_BRACE",
	TokenOpenDoubleParen:  "OPEN_DOUBLE_PAREN",
	TokenCloseDoubleParen: "CLOSE_DOUBLE_PAREN",
	TokenOpenAngle:        "OPEN_ANGLE",
	TokenCloseAngle:       "CLOSE_ANGLE",
	TokenArrowSolid:       "ARROW_SOLID",
	TokenArrowDotted:      "ARROW_DOTTED",
	TokenArrowThick:       "ARROW_THICK",
	TokenArrowOpen:        "ARROW_OPEN",
	TokenArrowPoint:       "ARROW_POINT",
	TokenArrowCross:       "ARROW_CROSS",
	TokenArrowCircle:      "ARROW_CIRCLE",
	TokenEROneToMany:      "ER_ONE_TO_MANY",
	TokenEROneToManyAlt:   "ER_ONE_TO_MANY_ALT",
	TokenERManyToOne:      "ER_MANY_TO_ONE",
	TokenEROneToOne:       "ER_ONE_TO_ONE",
	TokenERManyToMany:     "ER_MANY_TO_MANY",
	TokenERManyToManyAlt:  "ER_MANY_TO_MANY_ALT",
	TokenERZeroToOne:      "ER_ZERO_TO_ONE",
	TokenClassInheritance: "CLASS_INHERITANCE",
	TokenClassComposition: "CLASS_COMPOSITION",
	TokenClassAggregation: "CLASS_AGGREGATION",
	TokenClassAssociation: "CLASS_ASSOCIATION",
	TokenClassRealization: "CLASS_REALIZATION",
	TokenClassDependency:  "CLASS_DEPENDENCY",
	TokenERCardOnlyOne:    "ER_CARD_ONLY_ONE",
	TokenERCardZeroOrOne:  "ER_CARD_ZERO_OR_ONE",
	TokenERCardZeroOrMore: "ER_CARD_ZERO_OR_MORE",
	TokenERCardOneOrMore:  "ER_CARD_ONE_OR_MORE",
	TokenERCardCloseOne:   "ER_CARD_CLOSE_ONE",
	TokenERCardCloseZero:  "ER_CARD_CLOSE_ZERO",
	TokenPipe:             "PIPE",
	TokenColon:            "COLON",
	TokenSemicolon:        "SEMICOLON",
	TokenComma:            "COMMA",
	TokenAmpersand:        "AMPERSAND",
	TokenMult:             "MULT",
	TokenPlus:             "PLUS",
	TokenMinus:            "MINUS",
	TokenEquals:           "EQUALS",
	TokenDot:              "DOT",
	TokenExclamation:      "EXCLAMATION",
	TokenSlash:            "SLASH",
	TokenBackslash:        "BACKSLASH",
	TokenHash:             "HASH",
	TokenAt:               "AT",
	TokenPercent:          "PERCENT",
	TokenTilde:            "TILDE",
	TokenQuestion:         "QUESTION",
	TokenError:            "ERROR",
}
// Lexer performs lexical analysis on mermaid input.
type Lexer struct {
	input    string
	position int
	line     int
	column   int
	tokens   []Token
}

// NewLexer creates a new lexer for the given input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input:  input,
		line:   1,
		column: 1,
		tokens: make([]Token, 0),
	}
}

// Tokenize performs lexical analysis and returns all tokens.
func (l *Lexer) Tokenize() ([]Token, error) {
	for l.position < len(l.input) {
		if err := l.nextToken(); err != nil {
			return nil, err
		}
	}
	// Add the terminating EOF token.
	l.addToken(TokenEOF, "")
	return l.tokens, nil
}
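
// tokenizeExample is a small, self-contained usage sketch for NewLexer and
// Tokenize (illustrative only; nothing else in the package references it,
// and the input string is an arbitrary sample).
func tokenizeExample() {
	lx := NewLexer("graph TD\n    A --> B")
	tokens, err := lx.Tokenize()
	if err != nil {
		fmt.Println("lex error:", err)
		return
	}
	for _, tok := range tokens {
		fmt.Println(tok) // each token prints with its type, value, and location
	}
}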
// nextToken processes the next token from the input.
func (l *Lexer) nextToken() error {
	if l.position >= len(l.input) {
		return nil
	}
	ch := l.current()

	// Skip whitespace but track newlines.
	if unicode.IsSpace(ch) {
		return l.consumeWhitespace()
	}

	// Comments - following the mermaid.js %% pattern.
	if ch == '%' && l.peek() == '%' {
		return l.consumeComment()
	}

	// Multi-character operators first (order matters!). TokenError doubles
	// as the "no match" sentinel here.
	if multiChar := l.tryMultiCharOperator(); multiChar != TokenError {
		return nil
	}

	// Keywords and identifiers - supports Unicode letters, including Chinese.
	if l.isUnicodeIdentifierStart(ch) {
		return l.consumeIdentifier()
	}

	// Numbers.
	if unicode.IsDigit(ch) {
		return l.consumeNumber()
	}

	// Strings.
	if ch == '"' {
		return l.consumeString()
	}

	// Single-character tokens.
	return l.consumeSingleChar()
}
// tryMultiCharOperator attempts to match a multi-character operator at the
// current position. It returns the matched token type, or TokenError as the
// "no match" sentinel. Longer patterns must be checked before any pattern
// that is their prefix.
func (l *Lexer) tryMultiCharOperator() TokenType {
	// State diagram special markers first.
	if l.matchString("<<fork>>") {
		l.addTokenAndAdvance(TokenID, "<<fork>>", 8)
		return TokenID
	}
	if l.matchString("<<join>>") {
		l.addTokenAndAdvance(TokenID, "<<join>>", 8)
		return TokenID
	}
	if l.matchString("<<choice>>") {
		l.addTokenAndAdvance(TokenID, "<<choice>>", 10)
		return TokenID
	}
	if l.matchString("<<history>>") {
		l.addTokenAndAdvance(TokenID, "<<history>>", 11)
		return TokenID
	}
	if l.matchString("<<deepHistory>>") {
		l.addTokenAndAdvance(TokenID, "<<deepHistory>>", 15)
		return TokenID
	}

	// ER diagram relationships (must precede the shorter cardinality patterns).
	if l.matchString("||--o{") {
		l.addTokenAndAdvance(TokenEROneToMany, "||--o{", 6)
		return TokenEROneToMany
	}
	if l.matchString("}o--||") {
		l.addTokenAndAdvance(TokenERManyToOne, "}o--||", 6)
		return TokenERManyToOne
	}
	if l.matchString("||--||") {
		l.addTokenAndAdvance(TokenEROneToOne, "||--||", 6)
		return TokenEROneToOne
	}
	if l.matchString("}o--o{") {
		l.addTokenAndAdvance(TokenERManyToMany, "}o--o{", 6)
		return TokenERManyToMany
	}
	if l.matchString("||--o|") {
		l.addTokenAndAdvance(TokenERZeroToOne, "||--o|", 6)
		return TokenERZeroToOne
	}
	if l.matchString("||--|{") {
		l.addTokenAndAdvance(TokenEROneToManyAlt, "||--|{", 6)
		return TokenEROneToManyAlt
	}
	if l.matchString("}|..|{") {
		l.addTokenAndAdvance(TokenERManyToManyAlt, "}|..|{", 6)
		return TokenERManyToManyAlt
	}

	// Class diagram relationship symbols.
	if l.matchString("<|--") {
		l.addTokenAndAdvance(TokenClassInheritance, "<|--", 4)
		return TokenClassInheritance
	}
	if l.matchString("*--") {
		l.addTokenAndAdvance(TokenClassComposition, "*--", 3)
		return TokenClassComposition
	}
	if l.matchString("o--") {
		l.addTokenAndAdvance(TokenClassAggregation, "o--", 3)
		return TokenClassAggregation
	}
	// "-->" is also the prefix of the sequence arrow "-->>", so only claim
	// it here when the longer arrow does not match. The lexer has no diagram
	// context, so a plain "-->" always lexes as the class association token.
	if l.matchString("-->") && !l.matchString("-->>") {
		l.addTokenAndAdvance(TokenClassAssociation, "-->", 3)
		return TokenClassAssociation
	}
	if l.matchString("..|>") {
		l.addTokenAndAdvance(TokenClassRealization, "..|>", 4)
		return TokenClassRealization
	}
	if l.matchString("..>") {
		l.addTokenAndAdvance(TokenClassDependency, "..>", 3)
		return TokenClassDependency
	}

	// ER diagram cardinality symbols.
	if l.matchString("||") {
		l.addTokenAndAdvance(TokenERCardOnlyOne, "||", 2)
		return TokenERCardOnlyOne
	}
	if l.matchString("o{") {
		l.addTokenAndAdvance(TokenERCardZeroOrMore, "o{", 2)
		return TokenERCardZeroOrMore
	}
	if l.matchString("|{") {
		l.addTokenAndAdvance(TokenERCardOneOrMore, "|{", 2)
		return TokenERCardOneOrMore
	}
	if l.matchString("}|") {
		l.addTokenAndAdvance(TokenERCardCloseOne, "}|", 2)
		return TokenERCardCloseOne
	}
	if l.matchString("}o") {
		l.addTokenAndAdvance(TokenERCardCloseZero, "}o", 2)
		return TokenERCardCloseZero
	}

	// Diagram type declarations ("stateDiagram-v2" before its prefix).
	if l.matchString("stateDiagram-v2") {
		l.addTokenAndAdvance(TokenID, "stateDiagram-v2", 15)
		return TokenID
	}
	if l.matchString("stateDiagram") {
		l.addTokenAndAdvance(TokenID, "stateDiagram", 12)
		return TokenID
	}
	if l.matchString("sequenceDiagram") {
		l.addTokenAndAdvance(TokenID, "sequenceDiagram", 15)
		return TokenID
	}
	if l.matchString("erDiagram") {
		l.addTokenAndAdvance(TokenID, "erDiagram", 9)
		return TokenID
	}

	// Sequence diagram arrows (longer patterns first, following the
	// mermaid.js patterns). Bidirectional arrows:
	if l.matchString("<<-->>") {
		l.addTokenAndAdvance(TokenArrowDotted, "<<-->>", 6)
		return TokenArrowDotted
	}
	if l.matchString("<<->>") {
		l.addTokenAndAdvance(TokenArrowSolid, "<<->>", 5)
		return TokenArrowSolid
	}
	// Complex arrows with directional markers.
	if l.matchString("--|\\") {
		l.addTokenAndAdvance(TokenArrowSolid, "--|\\", 4)
		return TokenArrowSolid
	}
	if l.matchString("--|/") {
		l.addTokenAndAdvance(TokenArrowSolid, "--|/", 4)
		return TokenArrowSolid
	}
	if l.matchString("-|\\") {
		l.addTokenAndAdvance(TokenArrowSolid, "-|\\", 3)
		return TokenArrowSolid
	}
	if l.matchString("-|/") {
		l.addTokenAndAdvance(TokenArrowSolid, "-|/", 3)
		return TokenArrowSolid
	}
	// Standard arrows.
	if l.matchString("-->>") {
		l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4)
		return TokenArrowDotted
	}
	if l.matchString("->>") {
		l.addTokenAndAdvance(TokenArrowSolid, "->>", 3)
		return TokenArrowSolid
	}
	if l.matchString("<->") {
		l.addTokenAndAdvance(TokenArrowSolid, "<->", 3)
		return TokenArrowSolid
	}
	if l.matchString("-x") {
		l.addTokenAndAdvance(TokenArrowCross, "-x", 2)
		return TokenArrowCross
	}
	if l.matchString("--)") {
		l.addTokenAndAdvance(TokenArrowOpen, "--)", 3)
		return TokenArrowOpen
	}
	if l.matchString("->") {
		l.addTokenAndAdvance(TokenArrowSolid, "->", 2)
		return TokenArrowSolid
	}
	if l.matchString("-)") {
		l.addTokenAndAdvance(TokenArrowOpen, "-)", 2)
		return TokenArrowOpen
	}

	// Flowchart arrows - based on the destructLink patterns. A plain "-->"
	// was already consumed as TokenClassAssociation above, so it is not
	// re-checked here.
	if l.matchString("==>") {
		l.addTokenAndAdvance(TokenArrowThick, "==>", 3)
		return TokenArrowThick
	}
	if l.matchString("-.->") {
		l.addTokenAndAdvance(TokenArrowDotted, "-.->", 4)
		return TokenArrowDotted
	}
	if l.matchString("--x") {
		l.addTokenAndAdvance(TokenArrowCross, "--x", 3)
		return TokenArrowCross
	}
	if l.matchString("--o") {
		l.addTokenAndAdvance(TokenArrowCircle, "--o", 3)
		return TokenArrowCircle
	}
	if l.matchString("---") {
		l.addTokenAndAdvance(TokenArrowOpen, "---", 3)
		return TokenArrowOpen
	}
	if l.matchString("((") {
		l.addTokenAndAdvance(TokenOpenDoubleParen, "((", 2)
		return TokenOpenDoubleParen
	}
	if l.matchString("))") {
		l.addTokenAndAdvance(TokenCloseDoubleParen, "))", 2)
		return TokenCloseDoubleParen
	}

	// Single-character ER cardinality "o", but only when it is clearly not
	// the start of "o{", "o--", or an identifier (isNextChar assumes the
	// one-byte "o" just matched).
	if l.matchString("o") && !l.isNextChar('{') && !l.isNextChar('-') && !l.isNextCharLetter() {
		l.addTokenAndAdvance(TokenERCardZeroOrOne, "o", 1)
		return TokenERCardZeroOrOne
	}

	return TokenError
}
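
// arrowOrderExample illustrates the longest-match rule that
// tryMultiCharOperator depends on: "-->>" shares the prefix "-->", and the
// guard above is what keeps "A -->> B" from splitting into "-->" plus ">".
// Illustrative only; nothing else in the package references it.
func arrowOrderExample() {
	tokens, _ := NewLexer("A -->> B").Tokenize()
	for _, tok := range FilterTokens(tokens) {
		fmt.Println(tok) // ID("A"), ARROW_DOTTED("-->>"), ID("B"), then EOF
	}
}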
// consumeWhitespace consumes a run of whitespace characters. A newline ends
// the run and is emitted as its own TokenNewline.
func (l *Lexer) consumeWhitespace() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsSpace(l.current()) {
		if l.current() == '\n' {
			// Emit any spaces collected before the newline while the line
			// and column counters still point at this line.
			if start < l.position {
				l.addToken(TokenSpace, l.input[start:l.position])
			}
			l.advance() // advance() updates the line and column for '\n'
			l.addToken(TokenNewline, "\n")
			return nil
		}
		l.advance()
	}
	if start < l.position {
		l.addToken(TokenSpace, l.input[start:l.position])
	}
	return nil
}
// consumeComment consumes a %% comment through the end of the line.
func (l *Lexer) consumeComment() error {
	start := l.position
	// Skip the leading %%.
	l.advance()
	l.advance()
	// Read until end of line.
	for l.position < len(l.input) && l.current() != '\n' {
		l.advance()
	}
	l.addToken(TokenComment, l.input[start:l.position])
	return nil
}
// consumeIdentifier consumes identifiers and keywords with Unicode support.
func (l *Lexer) consumeIdentifier() error {
	start := l.position
	// The first character was already validated by the caller.
	l.advance()
	// Continue with Unicode identifier characters.
	for l.position < len(l.input) && l.isUnicodeIdentifierChar(l.current()) {
		l.advance()
	}
	value := l.input[start:l.position]
	l.addToken(l.getKeywordType(value), value)
	return nil
}
// getKeywordType returns the token type for keywords, or TokenID for plain
// identifiers. Keyword matching is case-insensitive.
func (l *Lexer) getKeywordType(value string) TokenType {
	// Keywords from flow.jison.
	switch strings.ToLower(value) {
	case "graph", "flowchart": // flowchart uses the same token as graph
		return TokenGraph
	case "subgraph":
		return TokenSubgraph
	case "end":
		return TokenEnd
	case "class":
		return TokenClass
	case "classdef":
		return TokenClassDef
	case "click":
		return TokenClick
	case "style":
		return TokenStyle
	case "linkstyle":
		return TokenLinkStyle
	case "default":
		return TokenDefault
	// Direction keywords.
	case "td":
		return TokenTD
	case "tb":
		return TokenTB
	case "bt":
		return TokenBT
	case "rl":
		return TokenRL
	case "lr":
		return TokenLR
	// State diagram action keywords.
	case "entry":
		return TokenEntry
	case "exit":
		return TokenExit
	case "do":
		return TokenDo
	// State diagram and ER attribute words that remain plain identifiers.
	case "state", "diagram", "statediagram-v2", "statediagram", "pk", "fk", "uk":
		return TokenID
	default:
		return TokenID
	}
}
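
// keywordCaseExample records an easy-to-miss property of getKeywordType:
// matching is case-insensitive, and unknown words fall back to TokenID.
// Illustrative only; nothing else in the package references it.
func keywordCaseExample() {
	l := NewLexer("")
	fmt.Println(l.getKeywordType("GRAPH") == TokenGraph)       // true
	fmt.Println(l.getKeywordType("Subgraph") == TokenSubgraph) // true
	fmt.Println(l.getKeywordType("nodeA") == TokenID)          // true
}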
// consumeNumber consumes numeric literals, including decimals, duration
// suffixes, and date-like digit runs.
func (l *Lexer) consumeNumber() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsDigit(l.current()) {
		l.advance()
	}
	// Handle a decimal point.
	if l.position < len(l.input) && l.current() == '.' {
		l.advance()
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
	}
	// Check for duration suffixes (d, w, m, y, h).
	if l.position < len(l.input) {
		switch l.current() {
		case 'd', 'w', 'm', 'y', 'h':
			l.advance()
		}
	}
	// Check for date format (YYYY-MM-DD): additional digits joined by dashes.
	if l.position < len(l.input) && l.current() == '-' {
		l.advance() // consume the dash
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
		// Look for another dash and more digits.
		if l.position < len(l.input) && l.current() == '-' {
			l.advance() // consume the second dash
			for l.position < len(l.input) && unicode.IsDigit(l.current()) {
				l.advance()
			}
		}
	}
	l.addToken(TokenNumber, l.input[start:l.position])
	return nil
}
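
// numberFormsExample shows the shapes consumeNumber accepts: integers,
// decimals, duration-suffixed values, and date-like digit runs, each lexed
// as a single NUMBER token. Illustrative only; nothing else references it.
func numberFormsExample() {
	for _, src := range []string{"42", "3.14", "2d", "2024-01-15"} {
		tokens, _ := NewLexer(src).Tokenize()
		fmt.Println(tokens[0]) // one NUMBER token covering the whole input
	}
}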
// consumeString consumes a double-quoted string literal, keeping the quotes
// in the token value.
func (l *Lexer) consumeString() error {
	start := l.position
	// Skip the opening quote.
	l.advance()
	for l.position < len(l.input) && l.current() != '"' {
		if l.current() == '\\' && l.position+1 < len(l.input) {
			// Skip the backslash and the escaped character.
			l.advance()
			l.advance()
		} else {
			l.advance()
		}
	}
	if l.position >= len(l.input) {
		return fmt.Errorf("unterminated string at line %d, column %d", l.line, l.column)
	}
	// Skip the closing quote.
	l.advance()
	l.addToken(TokenString, l.input[start:l.position])
	return nil
}
// consumeSingleChar handles single-character tokens.
func (l *Lexer) consumeSingleChar() error {
	ch := l.current()
	var tokenType TokenType
	switch ch {
	case '[':
		tokenType = TokenOpenBracket
	case ']':
		tokenType = TokenCloseBracket
	case '(':
		tokenType = TokenOpenParen
	case ')':
		tokenType = TokenCloseParen
	case '{':
		tokenType = TokenOpenBrace
	case '}':
		tokenType = TokenCloseBrace
	case '<':
		tokenType = TokenOpenAngle
	case '>':
		tokenType = TokenCloseAngle
	case '|':
		tokenType = TokenPipe
	case ':':
		tokenType = TokenColon
	case ';':
		tokenType = TokenSemicolon
	case ',':
		tokenType = TokenComma
	case '&':
		tokenType = TokenAmpersand
	case '*':
		tokenType = TokenMult
	case '+':
		tokenType = TokenPlus
	case '-':
		tokenType = TokenMinus
	case '=':
		tokenType = TokenEquals
	case '.':
		tokenType = TokenDot
	case '!':
		tokenType = TokenExclamation
	case '/':
		tokenType = TokenSlash
	case '\\':
		tokenType = TokenBackslash
	case '#':
		tokenType = TokenHash
	case '@':
		tokenType = TokenAt
	case '%':
		tokenType = TokenPercent
	case '~':
		tokenType = TokenTilde
	case '?':
		tokenType = TokenQuestion
	default:
		return fmt.Errorf("unexpected character '%c' at line %d, column %d", ch, l.line, l.column)
	}
	l.addTokenAndAdvance(tokenType, string(ch), 1)
	return nil
}
// Helper methods

// current returns the rune at the current position, or 0 at end of input.
func (l *Lexer) current() rune {
	if l.position >= len(l.input) {
		return 0
	}
	r, _ := utf8.DecodeRuneInString(l.input[l.position:])
	return r
}

// peek returns the rune after the current one without advancing, or 0 if
// there is none.
func (l *Lexer) peek() rune {
	if l.position >= len(l.input) {
		return 0
	}
	_, size := utf8.DecodeRuneInString(l.input[l.position:])
	if l.position+size >= len(l.input) {
		return 0
	}
	next, _ := utf8.DecodeRuneInString(l.input[l.position+size:])
	return next
}

// advance moves past the current rune, updating line and column counters.
func (l *Lexer) advance() {
	if l.position < len(l.input) {
		r, size := utf8.DecodeRuneInString(l.input[l.position:])
		if r == '\n' {
			l.line++
			l.column = 1
		} else {
			l.column++
		}
		l.position += size
	}
}
// matchString reports whether the input matches s at the current position.
func (l *Lexer) matchString(s string) bool {
	if l.position+len(s) > len(l.input) {
		return false
	}
	return l.input[l.position:l.position+len(s)] == s
}

// isNextChar reports whether the byte after the current one equals ch. It
// assumes the current character is a single byte (it is only used right
// after matching an ASCII operator).
func (l *Lexer) isNextChar(ch byte) bool {
	if l.position+1 >= l en(l.input) {
		return false
	}
	return l.input[l.position+1] == ch
}

// isNextCharLetter reports whether the byte after the current one is an
// ASCII letter (multi-byte runes are not decoded here).
func (l *Lexer) isNextCharLetter() bool {
	if l.position+1 >= len(l.input) {
		return false
	}
	return unicode.IsLetter(rune(l.input[l.position+1]))
}
// addToken records a token ending at the current position; the token's
// Column and Position are derived by backing up over the value.
func (l *Lexer) addToken(tokenType TokenType, value string) {
	runeCount := utf8.RuneCountInString(value)
	byteCount := len(value)
	token := Token{
		Type:     tokenType,
		Value:    value,
		Line:     l.line,
		Column:   l.column - runeCount,
		Position: l.position - byteCount,
	}
	l.tokens = append(l.tokens, token)
}

// addTokenAndAdvance advances past runeCount runes and then records the
// token. Advancing first keeps addToken's back-dating of Column and Position
// correct; recording before advancing would subtract the token length from
// its starting offset instead of its end.
func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, runeCount int) {
	for i := 0; i < runeCount; i++ {
		l.advance()
	}
	l.addToken(tokenType, value)
}
// isChineseChar reports whether ch falls in the CJK ranges accepted in
// identifiers. Despite the name, this also covers kana, hangul, bopomofo,
// and related CJK punctuation and width-variant blocks.
func (l *Lexer) isChineseChar(ch rune) bool {
	return (ch >= 0x4e00 && ch <= 0x9fff) || // CJK Unified Ideographs
		(ch >= 0x3400 && ch <= 0x4dbf) || // CJK Extension A
		(ch >= 0x20000 && ch <= 0x2a6df) || // CJK Extension B
		(ch >= 0x2a700 && ch <= 0x2b73f) || // CJK Extension C
		(ch >= 0x2b740 && ch <= 0x2b81f) || // CJK Extension D
		(ch >= 0x2b820 && ch <= 0x2ceaf) || // CJK Extension E
		(ch >= 0x2ceb0 && ch <= 0x2ebef) || // CJK Extension F
		(ch >= 0x30000 && ch <= 0x3134f) || // CJK Extension G
		(ch >= 0x3190 && ch <= 0x319f) || // Kanbun
		(ch >= 0x31c0 && ch <= 0x31ef) || // CJK Strokes
		(ch >= 0x2e80 && ch <= 0x2eff) || // CJK Radicals Supplement
		(ch >= 0x2f00 && ch <= 0x2fdf) || // Kangxi Radicals
		(ch >= 0x2ff0 && ch <= 0x2fff) || // Ideographic Description Characters
		(ch >= 0x3000 && ch <= 0x303f) || // CJK Symbols and Punctuation
		(ch >= 0x3040 && ch <= 0x309f) || // Hiragana
		(ch >= 0x30a0 && ch <= 0x30ff) || // Katakana
		(ch >= 0x3100 && ch <= 0x312f) || // Bopomofo
		(ch >= 0x3130 && ch <= 0x318f) || // Hangul Compatibility Jamo
		(ch >= 0x31a0 && ch <= 0x31bf) || // Bopomofo Extended
		(ch >= 0xac00 && ch <= 0xd7af) || // Hangul Syllables
		(ch >= 0xff00 && ch <= 0xffef) // Halfwidth and Fullwidth Forms
}

// isUnicodeIdentifierStart reports whether ch can start an identifier.
func (l *Lexer) isUnicodeIdentifierStart(ch rune) bool {
	return unicode.IsLetter(ch) || ch == '_' || l.isChineseChar(ch)
}

// isUnicodeIdentifierChar reports whether ch can appear inside an identifier.
func (l *Lexer) isUnicodeIdentifierChar(ch rune) bool {
	return unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' || ch == '-' || l.isChineseChar(ch)
}
// FilterTokens removes whitespace and comment tokens before parsing.
func FilterTokens(tokens []Token) []Token {
	filtered := make([]Token, 0, len(tokens))
	for _, token := range tokens {
		if token.Type != TokenSpace && token.Type != TokenComment {
			filtered = append(filtered, token)
		}
	}
	return filtered
}
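
// filterExample sketches the intended pre-parse pipeline: lex the source,
// then drop whitespace and comment tokens so a parser only sees significant
// input. Illustrative only; the ER snippet is an arbitrary sample.
func filterExample() ([]Token, error) {
	tokens, err := NewLexer("erDiagram\n  CUSTOMER ||--o{ ORDER : places").Tokenize()
	if err != nil {
		return nil, err
	}
	// NEWLINE tokens survive filtering and remain available as statement
	// separators; SPACE and COMMENT tokens do not.
	return FilterTokens(tokens), nil
}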