lexer.go

// Package lexer provides lexical analysis for Mermaid diagram syntax.
// Based on the lexical rules from flow.jison in mermaid.js.
package lexer

import (
	"fmt"
	"strings"
	"unicode"
)

// TokenType represents the type of a lexical token.
type TokenType int

const (
	// Special tokens
	TokenEOF TokenType = iota
	TokenNewline
	TokenSpace
	TokenComment

	// Keywords - from flow.jison
	TokenGraph
	TokenSubgraph
	TokenEnd
	TokenDirection
	TokenClass
	TokenClassDef
	TokenClick
	TokenStyle
	TokenLinkStyle
	TokenDefault

	// Directions
	TokenTD // Top Down
	TokenTB // Top Bottom
	TokenBT // Bottom Top
	TokenRL // Right Left
	TokenLR // Left Right

	// Identifiers and literals
	TokenID
	TokenString
	TokenNodeString
	TokenNumber
	TokenUnicodeText

	// Shape delimiters - following JISON patterns
	TokenOpenBracket      // [
	TokenCloseBracket     // ]
	TokenOpenParen        // (
	TokenCloseParen       // )
	TokenOpenBrace        // {
	TokenCloseBrace       // }
	TokenOpenDoubleParen  // ((
	TokenCloseDoubleParen // ))
	TokenOpenAngle        // <
	TokenCloseAngle       // >

	// Edge tokens - from destructLink logic in flowDb.ts
	TokenArrowSolid  // -->
	TokenArrowDotted // -.->
	TokenArrowThick  // ==>
	TokenArrowOpen   // ---
	TokenArrowPoint  // -->
	TokenArrowCross  // --x
	TokenArrowCircle // --o

	// ER diagram relationship tokens
	TokenEROneToMany  // ||--o{
	TokenERManyToOne  // }o--||
	TokenEROneToOne   // ||--||
	TokenERManyToMany // }o--o{
	TokenERZeroToOne  // ||--o|

	// Edge modifiers
	TokenPipe        // |
	TokenColon       // :
	TokenSemicolon   // ;
	TokenComma       // ,
	TokenAmpersand   // &
	TokenMult        // *
	TokenPlus        // +
	TokenMinus       // -
	TokenEquals      // =
	TokenDot         // .
	TokenExclamation // !
	TokenSlash       // /
	TokenBackslash   // \
	TokenHash        // #
	TokenAt          // @
	TokenPercent     // %
	TokenTilde       // ~
	TokenQuestion    // ?

	// Error token
	TokenError
)

// Token represents a lexical token.
type Token struct {
	Type     TokenType
	Value    string
	Line     int
	Column   int
	Position int
}

// String returns a string representation of the token.
func (t Token) String() string {
	return fmt.Sprintf("Token{Type: %s, Value: %q, Line: %d, Col: %d}",
		t.Type.String(), t.Value, t.Line, t.Column)
}

// String returns the string representation of TokenType.
func (tt TokenType) String() string {
	if name, exists := tokenTypeNames[tt]; exists {
		return name
	}
	return fmt.Sprintf("TokenType(%d)", int(tt))
}

var tokenTypeNames = map[TokenType]string{
	TokenEOF:              "EOF",
	TokenNewline:          "NEWLINE",
	TokenSpace:            "SPACE",
	TokenComment:          "COMMENT",
	TokenGraph:            "GRAPH",
	TokenSubgraph:         "SUBGRAPH",
	TokenEnd:              "END",
	TokenDirection:        "DIRECTION",
	TokenClass:            "CLASS",
	TokenClassDef:         "CLASSDEF",
	TokenClick:            "CLICK",
	TokenStyle:            "STYLE",
	TokenLinkStyle:        "LINKSTYLE",
	TokenDefault:          "DEFAULT",
	TokenTD:               "TD",
	TokenTB:               "TB",
	TokenBT:               "BT",
	TokenRL:               "RL",
	TokenLR:               "LR",
	TokenID:               "ID",
	TokenString:           "STRING",
	TokenNodeString:       "NODE_STRING",
	TokenNumber:           "NUMBER",
	TokenUnicodeText:      "UNICODE_TEXT",
	TokenOpenBracket:      "OPEN_BRACKET",
	TokenCloseBracket:     "CLOSE_BRACKET",
	TokenOpenParen:        "OPEN_PAREN",
	TokenCloseParen:       "CLOSE_PAREN",
	TokenOpenBrace:        "OPEN_BRACE",
	TokenCloseBrace:       "CLOSE_BRACE",
	TokenOpenDoubleParen:  "OPEN_DOUBLE_PAREN",
	TokenCloseDoubleParen: "CLOSE_DOUBLE_PAREN",
	TokenOpenAngle:        "OPEN_ANGLE",
	TokenCloseAngle:       "CLOSE_ANGLE",
	TokenArrowSolid:       "ARROW_SOLID",
	TokenArrowDotted:      "ARROW_DOTTED",
	TokenArrowThick:       "ARROW_THICK",
	TokenArrowOpen:        "ARROW_OPEN",
	TokenArrowPoint:       "ARROW_POINT",
	TokenArrowCross:       "ARROW_CROSS",
	TokenArrowCircle:      "ARROW_CIRCLE",
	TokenEROneToMany:      "ER_ONE_TO_MANY",
	TokenERManyToOne:      "ER_MANY_TO_ONE",
	TokenEROneToOne:       "ER_ONE_TO_ONE",
	TokenERManyToMany:     "ER_MANY_TO_MANY",
	TokenERZeroToOne:      "ER_ZERO_TO_ONE",
	TokenPipe:             "PIPE",
	TokenColon:            "COLON",
	TokenSemicolon:        "SEMICOLON",
	TokenComma:            "COMMA",
	TokenAmpersand:        "AMPERSAND",
	TokenMult:             "MULT",
	TokenPlus:             "PLUS",
	TokenMinus:            "MINUS",
	TokenEquals:           "EQUALS",
	TokenDot:              "DOT",
	TokenExclamation:      "EXCLAMATION",
	TokenSlash:            "SLASH",
	TokenBackslash:        "BACKSLASH",
	TokenHash:             "HASH",
	TokenAt:               "AT",
	TokenPercent:          "PERCENT",
	TokenTilde:            "TILDE",
	TokenQuestion:         "QUESTION",
	TokenError:            "ERROR",
}

// Lexer performs lexical analysis on mermaid input.
type Lexer struct {
	input    string
	position int
	line     int
	column   int
	tokens   []Token
}

// NewLexer creates a new lexer for the given input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input:  input,
		line:   1,
		column: 1,
		tokens: make([]Token, 0),
	}
}

// Tokenize performs lexical analysis and returns all tokens.
func (l *Lexer) Tokenize() ([]Token, error) {
	for l.position < len(l.input) {
		if err := l.nextToken(); err != nil {
			return nil, err
		}
	}
	// Add EOF token
	l.addToken(TokenEOF, "")
	return l.tokens, nil
}

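// Illustrative usage (a minimal sketch; the diagram text below is an
// arbitrary example input, not something defined elsewhere in this package):
//
//	lex := NewLexer("graph LR; A-->B")
//	tokens, err := lex.Tokenize()
//	if err != nil {
//		// handle the lexing error (e.g. an unterminated string)
//	}
//	for _, tok := range FilterTokens(tokens) {
//		fmt.Println(tok)
//	}
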
// nextToken processes the next token from input.
func (l *Lexer) nextToken() error {
	if l.position >= len(l.input) {
		return nil
	}

	ch := l.current()

	// Skip whitespace but track newlines
	if unicode.IsSpace(ch) {
		return l.consumeWhitespace()
	}

	// Comments - following mermaid.js pattern
	if ch == '%' && l.peek() == '%' {
		return l.consumeComment()
	}

	// Multi-character operators first (order matters!)
	if multiChar := l.tryMultiCharOperator(); multiChar != TokenError {
		return nil
	}

	// Keywords and identifiers
	if unicode.IsLetter(ch) || ch == '_' {
		return l.consumeIdentifier()
	}

	// Numbers
	if unicode.IsDigit(ch) {
		return l.consumeNumber()
	}

	// Strings
	if ch == '"' {
		return l.consumeString()
	}

	// Single character tokens
	return l.consumeSingleChar()
}

// tryMultiCharOperator attempts to match multi-character operators.
func (l *Lexer) tryMultiCharOperator() TokenType {
	// Check for ER diagram relationships first (need to be before shorter patterns)
	if l.matchString("||--o{") {
		l.addTokenAndAdvance(TokenEROneToMany, "||--o{", 6)
		return TokenEROneToMany
	}
	if l.matchString("}o--||") {
		l.addTokenAndAdvance(TokenERManyToOne, "}o--||", 6)
		return TokenERManyToOne
	}
	if l.matchString("||--||") {
		l.addTokenAndAdvance(TokenEROneToOne, "||--||", 6)
		return TokenEROneToOne
	}
	if l.matchString("}o--o{") {
		l.addTokenAndAdvance(TokenERManyToMany, "}o--o{", 6)
		return TokenERManyToMany
	}
	if l.matchString("||--o|") {
		l.addTokenAndAdvance(TokenERZeroToOne, "||--o|", 6)
		return TokenERZeroToOne
	}

	// Check for sequence diagram arrows
	if l.matchString("->>") {
		l.addTokenAndAdvance(TokenArrowSolid, "->>", 3)
		return TokenArrowSolid
	}
	if l.matchString("-->>") {
		l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4)
		return TokenArrowDotted
	}

	// Check for arrows - based on destructLink patterns
	if l.matchString("==>") {
		l.addTokenAndAdvance(TokenArrowThick, "==>", 3)
		return TokenArrowThick
	}
	if l.matchString("-->") {
		l.addTokenAndAdvance(TokenArrowSolid, "-->", 3)
		return TokenArrowSolid
	}
	if l.matchString("-.->") {
		l.addTokenAndAdvance(TokenArrowDotted, "-.->", 4)
		return TokenArrowDotted
	}
	if l.matchString("--x") {
		l.addTokenAndAdvance(TokenArrowCross, "--x", 3)
		return TokenArrowCross
	}
	if l.matchString("--o") {
		l.addTokenAndAdvance(TokenArrowCircle, "--o", 3)
		return TokenArrowCircle
	}
	if l.matchString("---") {
		l.addTokenAndAdvance(TokenArrowOpen, "---", 3)
		return TokenArrowOpen
	}

	if l.matchString("((") {
		l.addTokenAndAdvance(TokenOpenDoubleParen, "((", 2)
		return TokenOpenDoubleParen
	}
	if l.matchString("))") {
		l.addTokenAndAdvance(TokenCloseDoubleParen, "))", 2)
		return TokenCloseDoubleParen
	}

	return TokenError
}

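// A note on why the ordering above matters: longer operators must be tried
// before their prefixes. For example, the four-character "-->>" is tested
// before the three-character "-->"; otherwise an input such as "A-->>B"
// would lex as ARROW_SOLID followed by a stray CLOSE_ANGLE instead of
// ARROW_DOTTED.
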
// consumeWhitespace consumes whitespace characters.
func (l *Lexer) consumeWhitespace() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsSpace(l.current()) {
		if l.current() == '\n' {
			// Emit any run of spaces collected before the newline while the
			// position still points at the newline, then consume it and emit
			// a significant newline token.
			if start < l.position {
				l.addToken(TokenSpace, l.input[start:l.position])
			}
			l.advance()
			l.addToken(TokenNewline, "\n")
			return nil
		}
		l.advance()
	}
	if start < l.position {
		l.addToken(TokenSpace, l.input[start:l.position])
	}
	return nil
}

// consumeComment consumes a comment line.
func (l *Lexer) consumeComment() error {
	start := l.position
	// Skip %%
	l.advance()
	l.advance()
	// Read until end of line
	for l.position < len(l.input) && l.current() != '\n' {
		l.advance()
	}
	l.addToken(TokenComment, l.input[start:l.position])
	return nil
}

// consumeIdentifier consumes identifiers and keywords.
func (l *Lexer) consumeIdentifier() error {
	start := l.position
	// First character already validated
	l.advance()
	// Continue with alphanumeric and underscore
	for l.position < len(l.input) {
		ch := l.current()
		if unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' {
			l.advance()
		} else {
			break
		}
	}
	value := l.input[start:l.position]
	tokenType := l.getKeywordType(value)
	l.addToken(tokenType, value)
	return nil
}

// getKeywordType returns the token type for keywords, or TokenID for
// identifiers. Matching is case-insensitive, so "graph", "Graph", and
// "GRAPH" all map to TokenGraph.
func (l *Lexer) getKeywordType(value string) TokenType {
	// Keywords from flow.jison
	switch strings.ToLower(value) {
	case "graph":
		return TokenGraph
	case "flowchart":
		return TokenGraph // flowchart uses the same token as graph
	case "subgraph":
		return TokenSubgraph
	case "end":
		return TokenEnd
	case "class":
		return TokenClass
	case "classdef":
		return TokenClassDef
	case "click":
		return TokenClick
	case "style":
		return TokenStyle
	case "linkstyle":
		return TokenLinkStyle
	case "default":
		return TokenDefault
	// Direction keywords
	case "td":
		return TokenTD
	case "tb":
		return TokenTB
	case "bt":
		return TokenBT
	case "rl":
		return TokenRL
	case "lr":
		return TokenLR
	default:
		return TokenID
	}
}

// consumeNumber consumes numeric literals.
func (l *Lexer) consumeNumber() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsDigit(l.current()) {
		l.advance()
	}
	// Handle a decimal point only when a digit follows, so "5." lexes as a
	// number followed by a DOT token rather than swallowing the dot.
	if l.position < len(l.input) && l.current() == '.' && unicode.IsDigit(l.peek()) {
		l.advance()
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
	}
	l.addToken(TokenNumber, l.input[start:l.position])
	return nil
}

// consumeString consumes quoted string literals. The token value keeps the
// surrounding quotes, and escape sequences are left unprocessed.
func (l *Lexer) consumeString() error {
	start := l.position
	// Skip opening quote
	l.advance()
	for l.position < len(l.input) && l.current() != '"' {
		if l.current() == '\\' && l.position+1 < len(l.input) {
			// Skip escaped character
			l.advance()
			l.advance()
		} else {
			l.advance()
		}
	}
	if l.position >= len(l.input) {
		return fmt.Errorf("unterminated string at line %d, column %d", l.line, l.column)
	}
	// Skip closing quote
	l.advance()
	l.addToken(TokenString, l.input[start:l.position])
	return nil
}

// consumeSingleChar handles single character tokens.
func (l *Lexer) consumeSingleChar() error {
	ch := l.current()
	var tokenType TokenType

	switch ch {
	case '[':
		tokenType = TokenOpenBracket
	case ']':
		tokenType = TokenCloseBracket
	case '(':
		tokenType = TokenOpenParen
	case ')':
		tokenType = TokenCloseParen
	case '{':
		tokenType = TokenOpenBrace
	case '}':
		tokenType = TokenCloseBrace
	case '<':
		tokenType = TokenOpenAngle
	case '>':
		tokenType = TokenCloseAngle
	case '|':
		tokenType = TokenPipe
	case ':':
		tokenType = TokenColon
	case ';':
		tokenType = TokenSemicolon
	case ',':
		tokenType = TokenComma
	case '&':
		tokenType = TokenAmpersand
	case '*':
		tokenType = TokenMult
	case '+':
		tokenType = TokenPlus
	case '-':
		tokenType = TokenMinus
	case '=':
		tokenType = TokenEquals
	case '.':
		tokenType = TokenDot
	case '!':
		tokenType = TokenExclamation
	case '/':
		tokenType = TokenSlash
	case '\\':
		tokenType = TokenBackslash
	case '#':
		tokenType = TokenHash
	case '@':
		tokenType = TokenAt
	case '%':
		tokenType = TokenPercent
	case '~':
		tokenType = TokenTilde
	case '?':
		tokenType = TokenQuestion
	default:
		return fmt.Errorf("unexpected character '%c' at line %d, column %d", ch, l.line, l.column)
	}

	l.addTokenAndAdvance(tokenType, string(ch), 1)
	return nil
}

// Helper methods

// current returns the current character. Indexing is byte-wise, so
// multi-byte UTF-8 sequences are not decoded into a single rune.
func (l *Lexer) current() rune {
	if l.position >= len(l.input) {
		return 0
	}
	return rune(l.input[l.position])
}

// peek returns the next character without advancing.
func (l *Lexer) peek() rune {
	if l.position+1 >= len(l.input) {
		return 0
	}
	return rune(l.input[l.position+1])
}

// advance moves to the next character, updating line and column tracking.
func (l *Lexer) advance() {
	if l.position < len(l.input) {
		if l.input[l.position] == '\n' {
			l.line++
			l.column = 1
		} else {
			l.column++
		}
		l.position++
	}
}

// matchString checks if the input matches the given string at the current position.
func (l *Lexer) matchString(s string) bool {
	if l.position+len(s) > len(l.input) {
		return false
	}
	return l.input[l.position:l.position+len(s)] == s
}

// addToken adds a token to the token list. Line, Column, and Position are
// computed on the assumption that the token's value has already been
// consumed, so they point back to the start of the token.
func (l *Lexer) addToken(tokenType TokenType, value string) {
	token := Token{
		Type:     tokenType,
		Value:    value,
		Line:     l.line,
		Column:   l.column - len(value),
		Position: l.position - len(value),
	}
	l.tokens = append(l.tokens, token)
}

// addTokenAndAdvance consumes length characters and then records the token,
// so that addToken's start-of-token arithmetic stays consistent with the
// other call sites.
func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, length int) {
	for i := 0; i < length; i++ {
		l.advance()
	}
	l.addToken(tokenType, value)
}

// FilterTokens removes whitespace and comment tokens for parsing.
func FilterTokens(tokens []Token) []Token {
	filtered := make([]Token, 0, len(tokens))
	for _, token := range tokens {
		if token.Type != TokenSpace && token.Type != TokenComment {
			filtered = append(filtered, token)
		}
	}
	return filtered
}
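
// For a rough sense of the output, an input like "graph LR; A-->B" should,
// after FilterTokens, yield approximately: GRAPH("graph"), LR("LR"),
// SEMICOLON(";"), ID("A"), ARROW_SOLID("-->"), ID("B"), EOF. The exact
// stream depends on the rules above; this trace is illustrative only.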