lexer.go

// Package lexer provides lexical analysis for Mermaid diagram syntax.
// Based on the lexical rules from flow.jison in mermaid.js.
package lexer

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)
// TokenType represents the type of a lexical token.
type TokenType int

const (
	// Special tokens
	TokenEOF TokenType = iota
	TokenNewline
	TokenSpace
	TokenComment

	// Keywords - from flow.jison
	TokenGraph
	TokenSubgraph
	TokenEnd
	TokenDirection
	TokenClass
	TokenClassDef
	TokenClick
	TokenStyle
	TokenLinkStyle
	TokenDefault

	// Directions
	TokenTD // Top Down
	TokenTB // Top Bottom
	TokenBT // Bottom Top
	TokenRL // Right Left
	TokenLR // Left Right

	// Identifiers and literals
	TokenID
	TokenString
	TokenNodeString
	TokenNumber
	TokenUnicodeText

	// Shape delimiters - following JISON patterns
	TokenOpenBracket      // [
	TokenCloseBracket     // ]
	TokenOpenParen        // (
	TokenCloseParen       // )
	TokenOpenBrace        // {
	TokenCloseBrace       // }
	TokenOpenDoubleParen  // ((
	TokenCloseDoubleParen // ))
	TokenOpenAngle        // <
	TokenCloseAngle       // >

	// Edge tokens - from destructLink logic in flowDb.ts
	TokenArrowSolid  // -->
	TokenArrowDotted // -.->
	TokenArrowThick  // ==>
	TokenArrowOpen   // ---
	TokenArrowPoint  // -->
	TokenArrowCross  // --x
	TokenArrowCircle // --o

	// ER diagram relationship tokens
	TokenEROneToMany  // ||--o{
	TokenERManyToOne  // }o--||
	TokenEROneToOne   // ||--||
	TokenERManyToMany // }o--o{
	TokenERZeroToOne  // ||--o|

	// Edge modifiers
	TokenPipe        // |
	TokenColon       // :
	TokenSemicolon   // ;
	TokenComma       // ,
	TokenAmpersand   // &
	TokenMult        // *
	TokenPlus        // +
	TokenMinus       // -
	TokenEquals      // =
	TokenDot         // .
	TokenExclamation // !
	TokenSlash       // /
	TokenBackslash   // \
	TokenHash        // #
	TokenAt          // @
	TokenPercent     // %
	TokenTilde       // ~
	TokenQuestion    // ?

	// Error token
	TokenError
)
// Token represents a lexical token.
type Token struct {
	Type     TokenType
	Value    string
	Line     int
	Column   int
	Position int
}

// String returns a string representation of the token.
func (t Token) String() string {
	return fmt.Sprintf("Token{Type: %s, Value: %q, Line: %d, Col: %d}",
		tokenTypeNames[t.Type], t.Value, t.Line, t.Column)
}
var tokenTypeNames = map[TokenType]string{
	TokenEOF:              "EOF",
	TokenNewline:          "NEWLINE",
	TokenSpace:            "SPACE",
	TokenComment:          "COMMENT",
	TokenGraph:            "GRAPH",
	TokenSubgraph:         "SUBGRAPH",
	TokenEnd:              "END",
	TokenDirection:        "DIRECTION",
	TokenClass:            "CLASS",
	TokenClassDef:         "CLASSDEF",
	TokenClick:            "CLICK",
	TokenStyle:            "STYLE",
	TokenLinkStyle:        "LINKSTYLE",
	TokenDefault:          "DEFAULT",
	TokenTD:               "TD",
	TokenTB:               "TB",
	TokenBT:               "BT",
	TokenRL:               "RL",
	TokenLR:               "LR",
	TokenID:               "ID",
	TokenString:           "STRING",
	TokenNodeString:       "NODE_STRING",
	TokenNumber:           "NUMBER",
	TokenUnicodeText:      "UNICODE_TEXT",
	TokenOpenBracket:      "OPEN_BRACKET",
	TokenCloseBracket:     "CLOSE_BRACKET",
	TokenOpenParen:        "OPEN_PAREN",
	TokenCloseParen:       "CLOSE_PAREN",
	TokenOpenBrace:        "OPEN_BRACE",
	TokenCloseBrace:       "CLOSE_BRACE",
	TokenOpenDoubleParen:  "OPEN_DOUBLE_PAREN",
	TokenCloseDoubleParen: "CLOSE_DOUBLE_PAREN",
	TokenOpenAngle:        "OPEN_ANGLE",
	TokenCloseAngle:       "CLOSE_ANGLE",
	TokenArrowSolid:       "ARROW_SOLID",
	TokenArrowDotted:      "ARROW_DOTTED",
	TokenArrowThick:       "ARROW_THICK",
	TokenArrowOpen:        "ARROW_OPEN",
	TokenArrowPoint:       "ARROW_POINT",
	TokenArrowCross:       "ARROW_CROSS",
	TokenArrowCircle:      "ARROW_CIRCLE",
	TokenEROneToMany:      "ER_ONE_TO_MANY",
	TokenERManyToOne:      "ER_MANY_TO_ONE",
	TokenEROneToOne:       "ER_ONE_TO_ONE",
	TokenERManyToMany:     "ER_MANY_TO_MANY",
	TokenERZeroToOne:      "ER_ZERO_TO_ONE",
	TokenPipe:             "PIPE",
	TokenColon:            "COLON",
	TokenSemicolon:        "SEMICOLON",
	TokenComma:            "COMMA",
	TokenAmpersand:        "AMPERSAND",
	TokenMult:             "MULT",
	TokenPlus:             "PLUS",
	TokenMinus:            "MINUS",
	TokenEquals:           "EQUALS",
	TokenDot:              "DOT",
	TokenExclamation:      "EXCLAMATION",
	TokenSlash:            "SLASH",
	TokenBackslash:        "BACKSLASH",
	TokenHash:             "HASH",
	TokenAt:               "AT",
	TokenPercent:          "PERCENT",
	TokenTilde:            "TILDE",
	TokenQuestion:         "QUESTION",
	TokenError:            "ERROR",
}
// Lexer performs lexical analysis on mermaid input.
type Lexer struct {
	input    string
	position int
	line     int
	column   int
	tokens   []Token
}

// NewLexer creates a new lexer for the given input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input:  input,
		line:   1,
		column: 1,
		tokens: make([]Token, 0),
	}
}
// Tokenize performs lexical analysis and returns all tokens.
func (l *Lexer) Tokenize() ([]Token, error) {
	for l.position < len(l.input) {
		if err := l.nextToken(); err != nil {
			return nil, err
		}
	}
	// Add the EOF token.
	l.addToken(TokenEOF, "")
	return l.tokens, nil
}
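
// tokenizeSketch is a minimal usage sketch added for illustration; it is not
// part of the original API, and the input string below is only an assumed
// example. It lexes a tiny flowchart snippet and prints every token produced,
// including whitespace and newline tokens.
func tokenizeSketch() {
	lex := NewLexer("graph TD\nA --> B")
	tokens, err := lex.Tokenize()
	if err != nil {
		fmt.Println("lex error:", err)
		return
	}
	for _, tok := range tokens {
		fmt.Println(tok)
	}
}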
// nextToken processes the next token from the input.
func (l *Lexer) nextToken() error {
	if l.position >= len(l.input) {
		return nil
	}
	ch := l.current()

	// Whitespace (newlines are tracked as significant tokens).
	if unicode.IsSpace(ch) {
		return l.consumeWhitespace()
	}

	// Comments - following the mermaid.js %% pattern.
	if ch == '%' && l.peek() == '%' {
		return l.consumeComment()
	}

	// Multi-character operators first (order matters!).
	if multiChar := l.tryMultiCharOperator(); multiChar != TokenError {
		return nil
	}

	// Keywords and identifiers.
	if unicode.IsLetter(ch) || ch == '_' {
		return l.consumeIdentifier()
	}

	// Numbers.
	if unicode.IsDigit(ch) {
		return l.consumeNumber()
	}

	// Strings.
	if ch == '"' {
		return l.consumeString()
	}

	// Single-character tokens.
	return l.consumeSingleChar()
}
// tryMultiCharOperator attempts to match a multi-character operator at the
// current position. It returns the matched token type, or TokenError if no
// multi-character operator starts here.
func (l *Lexer) tryMultiCharOperator() TokenType {
	// Check ER diagram relationships first; they must be matched before the
	// shorter patterns.
	if l.matchString("||--o{") {
		l.addTokenAndAdvance(TokenEROneToMany, "||--o{", 6)
		return TokenEROneToMany
	}
	if l.matchString("}o--||") {
		l.addTokenAndAdvance(TokenERManyToOne, "}o--||", 6)
		return TokenERManyToOne
	}
	if l.matchString("||--||") {
		l.addTokenAndAdvance(TokenEROneToOne, "||--||", 6)
		return TokenEROneToOne
	}
	if l.matchString("}o--o{") {
		l.addTokenAndAdvance(TokenERManyToMany, "}o--o{", 6)
		return TokenERManyToMany
	}
	if l.matchString("||--o|") {
		l.addTokenAndAdvance(TokenERZeroToOne, "||--o|", 6)
		return TokenERZeroToOne
	}

	// Sequence diagram arrows.
	if l.matchString("->>") {
		l.addTokenAndAdvance(TokenArrowSolid, "->>", 3)
		return TokenArrowSolid
	}
	if l.matchString("-->>") {
		l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4)
		return TokenArrowDotted
	}

	// Flowchart arrows - based on the destructLink patterns.
	if l.matchString("==>") {
		l.addTokenAndAdvance(TokenArrowThick, "==>", 3)
		return TokenArrowThick
	}
	if l.matchString("-->") {
		l.addTokenAndAdvance(TokenArrowSolid, "-->", 3)
		return TokenArrowSolid
	}
	if l.matchString("-.->") {
		l.addTokenAndAdvance(TokenArrowDotted, "-.->", 4)
		return TokenArrowDotted
	}
	if l.matchString("--x") {
		l.addTokenAndAdvance(TokenArrowCross, "--x", 3)
		return TokenArrowCross
	}
	if l.matchString("--o") {
		l.addTokenAndAdvance(TokenArrowCircle, "--o", 3)
		return TokenArrowCircle
	}
	if l.matchString("---") {
		l.addTokenAndAdvance(TokenArrowOpen, "---", 3)
		return TokenArrowOpen
	}

	// Double parentheses.
	if l.matchString("((") {
		l.addTokenAndAdvance(TokenOpenDoubleParen, "((", 2)
		return TokenOpenDoubleParen
	}
	if l.matchString("))") {
		l.addTokenAndAdvance(TokenCloseDoubleParen, "))", 2)
		return TokenCloseDoubleParen
	}

	return TokenError
}
// consumeWhitespace consumes a run of whitespace, emitting a space token for
// leading spaces and a newline token when a line break is reached.
func (l *Lexer) consumeWhitespace() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsSpace(l.current()) {
		if l.current() == '\n' {
			// Emit any spaces collected so far, then the newline itself;
			// newlines are significant separators, so stop here.
			if start < l.position {
				l.addToken(TokenSpace, l.input[start:l.position])
			}
			l.advance()
			l.addToken(TokenNewline, "\n")
			return nil
		}
		l.advance()
	}
	if start < l.position {
		l.addToken(TokenSpace, l.input[start:l.position])
	}
	return nil
}
// consumeComment consumes a %% comment through the end of the line.
func (l *Lexer) consumeComment() error {
	start := l.position
	// Skip the leading %%.
	l.advance()
	l.advance()
	// Read until end of line.
	for l.position < len(l.input) && l.current() != '\n' {
		l.advance()
	}
	l.addToken(TokenComment, l.input[start:l.position])
	return nil
}
// consumeIdentifier consumes identifiers and keywords.
func (l *Lexer) consumeIdentifier() error {
	start := l.position
	// The first character has already been validated by nextToken.
	l.advance()
	// Continue with letters, digits, and underscores.
	for l.position < len(l.input) {
		ch := l.current()
		if unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' {
			l.advance()
		} else {
			break
		}
	}
	value := l.input[start:l.position]
	tokenType := l.getKeywordType(value)
	l.addToken(tokenType, value)
	return nil
}
// getKeywordType returns the token type for keywords, or TokenID for identifiers.
func (l *Lexer) getKeywordType(value string) TokenType {
	// Keywords from flow.jison.
	switch strings.ToLower(value) {
	case "graph":
		return TokenGraph
	case "flowchart":
		return TokenGraph // "flowchart" uses the same token as "graph"
	case "subgraph":
		return TokenSubgraph
	case "end":
		return TokenEnd
	case "class":
		return TokenClass
	case "classdef":
		return TokenClassDef
	case "click":
		return TokenClick
	case "style":
		return TokenStyle
	case "linkstyle":
		return TokenLinkStyle
	case "default":
		return TokenDefault
	// Direction keywords
	case "td":
		return TokenTD
	case "tb":
		return TokenTB
	case "bt":
		return TokenBT
	case "rl":
		return TokenRL
	case "lr":
		return TokenLR
	default:
		return TokenID
	}
}
// consumeNumber consumes numeric literals.
func (l *Lexer) consumeNumber() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsDigit(l.current()) {
		l.advance()
	}
	// Handle a decimal point only when a digit follows, so input such as "1."
	// does not swallow the trailing dot.
	if l.position < len(l.input) && l.current() == '.' && unicode.IsDigit(l.peek()) {
		l.advance()
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
	}
	l.addToken(TokenNumber, l.input[start:l.position])
	return nil
}
// consumeString consumes a double-quoted string literal (the quotes are kept
// in the token value).
func (l *Lexer) consumeString() error {
	start := l.position
	// Skip the opening quote.
	l.advance()
	for l.position < len(l.input) && l.current() != '"' {
		if l.current() == '\\' && l.position+1 < len(l.input) {
			// Skip the escaped character.
			l.advance()
			l.advance()
		} else {
			l.advance()
		}
	}
	if l.position >= len(l.input) {
		return fmt.Errorf("unterminated string at line %d, column %d", l.line, l.column)
	}
	// Skip the closing quote.
	l.advance()
	l.addToken(TokenString, l.input[start:l.position])
	return nil
}
// consumeSingleChar handles single-character tokens.
func (l *Lexer) consumeSingleChar() error {
	ch := l.current()
	var tokenType TokenType
	switch ch {
	case '[':
		tokenType = TokenOpenBracket
	case ']':
		tokenType = TokenCloseBracket
	case '(':
		tokenType = TokenOpenParen
	case ')':
		tokenType = TokenCloseParen
	case '{':
		tokenType = TokenOpenBrace
	case '}':
		tokenType = TokenCloseBrace
	case '<':
		tokenType = TokenOpenAngle
	case '>':
		tokenType = TokenCloseAngle
	case '|':
		tokenType = TokenPipe
	case ':':
		tokenType = TokenColon
	case ';':
		tokenType = TokenSemicolon
	case ',':
		tokenType = TokenComma
	case '&':
		tokenType = TokenAmpersand
	case '*':
		tokenType = TokenMult
	case '+':
		tokenType = TokenPlus
	case '-':
		tokenType = TokenMinus
	case '=':
		tokenType = TokenEquals
	case '.':
		tokenType = TokenDot
	case '!':
		tokenType = TokenExclamation
	case '/':
		tokenType = TokenSlash
	case '\\':
		tokenType = TokenBackslash
	case '#':
		tokenType = TokenHash
	case '@':
		tokenType = TokenAt
	case '%':
		tokenType = TokenPercent
	case '~':
		tokenType = TokenTilde
	case '?':
		tokenType = TokenQuestion
	default:
		return fmt.Errorf("unexpected character '%c' at line %d, column %d", ch, l.line, l.column)
	}
	l.addTokenAndAdvance(tokenType, string(ch), 1)
	return nil
}
// Helper methods

// current returns the rune at the current position, or 0 at end of input.
// Runes are decoded as UTF-8 so non-ASCII text is classified correctly.
func (l *Lexer) current() rune {
	if l.position >= len(l.input) {
		return 0
	}
	r, _ := utf8.DecodeRuneInString(l.input[l.position:])
	return r
}

// peek returns the rune after the current one without advancing, or 0 at end of input.
func (l *Lexer) peek() rune {
	if l.position >= len(l.input) {
		return 0
	}
	_, size := utf8.DecodeRuneInString(l.input[l.position:])
	if l.position+size >= len(l.input) {
		return 0
	}
	r, _ := utf8.DecodeRuneInString(l.input[l.position+size:])
	return r
}

// advance moves past the current rune, updating line and column counters.
func (l *Lexer) advance() {
	if l.position >= len(l.input) {
		return
	}
	r, size := utf8.DecodeRuneInString(l.input[l.position:])
	if r == '\n' {
		l.line++
		l.column = 1
	} else {
		l.column++
	}
	l.position += size
}
// matchString reports whether the input matches the given string at the current position.
func (l *Lexer) matchString(s string) bool {
	if l.position+len(s) > len(l.input) {
		return false
	}
	return l.input[l.position:l.position+len(s)] == s
}

// addToken appends a token to the token list. It assumes the token's value has
// just been consumed, so the stored column and position point at its start.
func (l *Lexer) addToken(tokenType TokenType, value string) {
	token := Token{
		Type:     tokenType,
		Value:    value,
		Line:     l.line,
		Column:   l.column - utf8.RuneCountInString(value),
		Position: l.position - len(value),
	}
	l.tokens = append(l.tokens, token)
}

// addTokenAndAdvance consumes length characters and then records the token, so
// that addToken's position arithmetic matches the convention used elsewhere.
func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, length int) {
	for i := 0; i < length; i++ {
		l.advance()
	}
	l.addToken(tokenType, value)
}
// FilterTokens removes whitespace and comment tokens before parsing.
func FilterTokens(tokens []Token) []Token {
	filtered := make([]Token, 0, len(tokens))
	for _, token := range tokens {
		if token.Type != TokenSpace && token.Type != TokenComment {
			filtered = append(filtered, token)
		}
	}
	return filtered
}
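
// countFilteredTokens is another illustrative sketch, not part of the original
// file: it shows the intended flow of handing FilterTokens output to a later
// stage. Since no parser is defined in this package, that stage is stood in
// for by a simple count of tokens per type.
func countFilteredTokens(src string) (map[TokenType]int, error) {
	tokens, err := NewLexer(src).Tokenize()
	if err != nil {
		return nil, err
	}
	counts := make(map[TokenType]int)
	for _, tok := range FilterTokens(tokens) {
		counts[tok.Type]++
	}
	return counts, nil
}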