lexer.go

// Package lexer provides lexical analysis for Mermaid diagram syntax.
// Based on the lexical rules from flow.jison in mermaid.js.
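//
// Minimal usage sketch, using only the API defined in this file:
//
//	l := lexer.NewLexer("graph TD")
//	tokens, err := l.Tokenize()
//	if err != nil {
//		// handle the lexing error
//	}
//	tokens = lexer.FilterTokens(tokens) // drop whitespace and comment tokens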
package lexer

import (
	"fmt"
	"strings"
	"unicode"
)
// TokenType represents the type of a lexical token.
type TokenType int

const (
	// Special tokens
	TokenEOF TokenType = iota
	TokenNewline
	TokenSpace
	TokenComment

	// Keywords - from flow.jison
	TokenGraph
	TokenSubgraph
	TokenEnd
	TokenDirection
	TokenClass
	TokenClassDef
	TokenClick
	TokenStyle
	TokenLinkStyle
	TokenDefault

	// Directions
	TokenTD // Top Down
	TokenTB // Top Bottom
	TokenBT // Bottom Top
	TokenRL // Right Left
	TokenLR // Left Right

	// Identifiers and literals
	TokenID
	TokenString
	TokenNodeString
	TokenNumber
	TokenUnicodeText

	// Shape delimiters - following the JISON patterns
	TokenOpenBracket      // [
	TokenCloseBracket     // ]
	TokenOpenParen        // (
	TokenCloseParen       // )
	TokenOpenBrace        // {
	TokenCloseBrace       // }
	TokenOpenDoubleParen  // ((
	TokenCloseDoubleParen // ))
	TokenOpenAngle        // <
	TokenCloseAngle       // >

	// Edge tokens - from the destructLink logic in flowDb.ts
	TokenArrowSolid  // -->
	TokenArrowDotted // -.->
	TokenArrowThick  // ==>
	TokenArrowOpen   // ---
	TokenArrowPoint  // -->
	TokenArrowCross  // --x
	TokenArrowCircle // --o

	// ER diagram relationship tokens
	TokenEROneToMany  // ||--o{
	TokenERManyToOne  // }o--||
	TokenEROneToOne   // ||--||
	TokenERManyToMany // }o--o{
	TokenERZeroToOne  // ||--o|

	// Edge modifiers
	TokenPipe        // |
	TokenColon       // :
	TokenSemicolon   // ;
	TokenComma       // ,
	TokenAmpersand   // &
	TokenMult        // *
	TokenPlus        // +
	TokenMinus       // -
	TokenEquals      // =
	TokenDot         // .
	TokenExclamation // !

	// Error token
	TokenError
)
// Token represents a lexical token.
type Token struct {
	Type     TokenType
	Value    string
	Line     int
	Column   int
	Position int
}

// String returns a string representation of the token.
func (t Token) String() string {
	return fmt.Sprintf("Token{Type: %s, Value: %q, Line: %d, Col: %d}",
		tokenTypeNames[t.Type], t.Value, t.Line, t.Column)
}
var tokenTypeNames = map[TokenType]string{
	TokenEOF:              "EOF",
	TokenNewline:          "NEWLINE",
	TokenSpace:            "SPACE",
	TokenComment:          "COMMENT",
	TokenGraph:            "GRAPH",
	TokenSubgraph:         "SUBGRAPH",
	TokenEnd:              "END",
	TokenDirection:        "DIRECTION",
	TokenClass:            "CLASS",
	TokenClassDef:         "CLASSDEF",
	TokenClick:            "CLICK",
	TokenStyle:            "STYLE",
	TokenLinkStyle:        "LINKSTYLE",
	TokenDefault:          "DEFAULT",
	TokenTD:               "TD",
	TokenTB:               "TB",
	TokenBT:               "BT",
	TokenRL:               "RL",
	TokenLR:               "LR",
	TokenID:               "ID",
	TokenString:           "STRING",
	TokenNodeString:       "NODE_STRING",
	TokenNumber:           "NUMBER",
	TokenUnicodeText:      "UNICODE_TEXT",
	TokenOpenBracket:      "OPEN_BRACKET",
	TokenCloseBracket:     "CLOSE_BRACKET",
	TokenOpenParen:        "OPEN_PAREN",
	TokenCloseParen:       "CLOSE_PAREN",
	TokenOpenBrace:        "OPEN_BRACE",
	TokenCloseBrace:       "CLOSE_BRACE",
	TokenOpenDoubleParen:  "OPEN_DOUBLE_PAREN",
	TokenCloseDoubleParen: "CLOSE_DOUBLE_PAREN",
	TokenOpenAngle:        "OPEN_ANGLE",
	TokenCloseAngle:       "CLOSE_ANGLE",
	TokenArrowSolid:       "ARROW_SOLID",
	TokenArrowDotted:      "ARROW_DOTTED",
	TokenArrowThick:       "ARROW_THICK",
	TokenArrowOpen:        "ARROW_OPEN",
	TokenArrowPoint:       "ARROW_POINT",
	TokenArrowCross:       "ARROW_CROSS",
	TokenArrowCircle:      "ARROW_CIRCLE",
	TokenEROneToMany:      "ER_ONE_TO_MANY",
	TokenERManyToOne:      "ER_MANY_TO_ONE",
	TokenEROneToOne:       "ER_ONE_TO_ONE",
	TokenERManyToMany:     "ER_MANY_TO_MANY",
	TokenERZeroToOne:      "ER_ZERO_TO_ONE",
	TokenPipe:             "PIPE",
	TokenColon:            "COLON",
	TokenSemicolon:        "SEMICOLON",
	TokenComma:            "COMMA",
	TokenAmpersand:        "AMPERSAND",
	TokenMult:             "MULT",
	TokenPlus:             "PLUS",
	TokenMinus:            "MINUS",
	TokenEquals:           "EQUALS",
	TokenDot:              "DOT",
	TokenExclamation:      "EXCLAMATION",
	TokenError:            "ERROR",
}
// Lexer performs lexical analysis on Mermaid input.
type Lexer struct {
	input    string
	position int
	line     int
	column   int
	tokens   []Token
}

// NewLexer creates a new lexer for the given input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input:  input,
		line:   1,
		column: 1,
		tokens: make([]Token, 0),
	}
}
// Tokenize performs lexical analysis and returns all tokens.
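//
// For example, the input "A --> B" yields the stream
// ID("A"), SPACE, ARROW_SOLID, SPACE, ID("B"), EOF;
// whitespace and comment tokens can be dropped afterwards with FilterTokens.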
func (l *Lexer) Tokenize() ([]Token, error) {
	for l.position < len(l.input) {
		if err := l.nextToken(); err != nil {
			return nil, err
		}
	}
	// Add the terminating EOF token.
	l.addToken(TokenEOF, "")
	return l.tokens, nil
}
// nextToken processes the next token from the input.
func (l *Lexer) nextToken() error {
	if l.position >= len(l.input) {
		return nil
	}
	ch := l.current()
	// Skip whitespace but track newlines.
	if unicode.IsSpace(ch) {
		return l.consumeWhitespace()
	}
	// Comments - following the mermaid.js "%%" pattern.
	if ch == '%' && l.peek() == '%' {
		return l.consumeComment()
	}
	// Multi-character operators first (order matters!).
	if multiChar := l.tryMultiCharOperator(); multiChar != TokenError {
		return nil
	}
	// Keywords and identifiers.
	if unicode.IsLetter(ch) || ch == '_' {
		return l.consumeIdentifier()
	}
	// Numbers.
	if unicode.IsDigit(ch) {
		return l.consumeNumber()
	}
	// Strings.
	if ch == '"' {
		return l.consumeString()
	}
	// Single-character tokens.
	return l.consumeSingleChar()
}
// tryMultiCharOperator attempts to match multi-character operators. It returns
// TokenError when nothing matched and no token was emitted.
func (l *Lexer) tryMultiCharOperator() TokenType {
	// Check for ER diagram relationships first (they must be matched before shorter patterns).
	if l.matchString("||--o{") {
		l.addTokenAndAdvance(TokenEROneToMany, "||--o{", 6)
		return TokenEROneToMany
	}
	if l.matchString("}o--||") {
		l.addTokenAndAdvance(TokenERManyToOne, "}o--||", 6)
		return TokenERManyToOne
	}
	if l.matchString("||--||") {
		l.addTokenAndAdvance(TokenEROneToOne, "||--||", 6)
		return TokenEROneToOne
	}
	if l.matchString("}o--o{") {
		l.addTokenAndAdvance(TokenERManyToMany, "}o--o{", 6)
		return TokenERManyToMany
	}
	if l.matchString("||--o|") {
		l.addTokenAndAdvance(TokenERZeroToOne, "||--o|", 6)
		return TokenERZeroToOne
	}
	// Check for sequence diagram arrows.
	if l.matchString("->>") {
		l.addTokenAndAdvance(TokenArrowSolid, "->>", 3)
		return TokenArrowSolid
	}
	if l.matchString("-->>") {
		l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4)
		return TokenArrowDotted
	}
	// Check for flowchart arrows - based on the destructLink patterns.
	if l.matchString("==>") {
		l.addTokenAndAdvance(TokenArrowThick, "==>", 3)
		return TokenArrowThick
	}
	if l.matchString("-->") {
		l.addTokenAndAdvance(TokenArrowSolid, "-->", 3)
		return TokenArrowSolid
	}
	if l.matchString("-.->") {
		l.addTokenAndAdvance(TokenArrowDotted, "-.->", 4)
		return TokenArrowDotted
	}
	if l.matchString("--x") {
		l.addTokenAndAdvance(TokenArrowCross, "--x", 3)
		return TokenArrowCross
	}
	if l.matchString("--o") {
		l.addTokenAndAdvance(TokenArrowCircle, "--o", 3)
		return TokenArrowCircle
	}
	if l.matchString("---") {
		l.addTokenAndAdvance(TokenArrowOpen, "---", 3)
		return TokenArrowOpen
	}
	// Double parentheses for circle-shaped nodes.
	if l.matchString("((") {
		l.addTokenAndAdvance(TokenOpenDoubleParen, "((", 2)
		return TokenOpenDoubleParen
	}
	if l.matchString("))") {
		l.addTokenAndAdvance(TokenCloseDoubleParen, "))", 2)
		return TokenCloseDoubleParen
	}
	return TokenError
}
// consumeWhitespace consumes a run of whitespace characters.
func (l *Lexer) consumeWhitespace() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsSpace(l.current()) {
		if l.current() == '\n' {
			// Emit any spaces collected before the newline, then the newline itself.
			if start < l.position {
				l.addToken(TokenSpace, l.input[start:l.position])
			}
			l.advance()
			l.addToken(TokenNewline, "\n")
			return nil
		}
		l.advance()
	}
	if start < l.position {
		l.addToken(TokenSpace, l.input[start:l.position])
	}
	return nil
}
// consumeComment consumes a comment line.
func (l *Lexer) consumeComment() error {
	start := l.position
	// Skip the leading %%.
	l.advance()
	l.advance()
	// Read until the end of the line.
	for l.position < len(l.input) && l.current() != '\n' {
		l.advance()
	}
	l.addToken(TokenComment, l.input[start:l.position])
	return nil
}
// consumeIdentifier consumes identifiers and keywords.
func (l *Lexer) consumeIdentifier() error {
	start := l.position
	// The first character was already validated by the caller.
	l.advance()
	// Continue with letters, digits, and underscores.
	for l.position < len(l.input) {
		ch := l.current()
		if unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' {
			l.advance()
		} else {
			break
		}
	}
	value := l.input[start:l.position]
	tokenType := l.getKeywordType(value)
	l.addToken(tokenType, value)
	return nil
}
// getKeywordType returns the token type for keywords, or TokenID for identifiers.
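// Matching is case-insensitive; for example, "graph" and "flowchart" both map
// to TokenGraph, "TD" maps to TokenTD, and anything else (say, a node name
// like "A1") falls through to TokenID.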
func (l *Lexer) getKeywordType(value string) TokenType {
	// Keywords from flow.jison.
	switch strings.ToLower(value) {
	case "graph":
		return TokenGraph
	case "flowchart":
		return TokenGraph // flowchart uses the same token as graph
	case "subgraph":
		return TokenSubgraph
	case "end":
		return TokenEnd
	case "class":
		return TokenClass
	case "classdef":
		return TokenClassDef
	case "click":
		return TokenClick
	case "style":
		return TokenStyle
	case "linkstyle":
		return TokenLinkStyle
	case "default":
		return TokenDefault
	// Direction keywords
	case "td":
		return TokenTD
	case "tb":
		return TokenTB
	case "bt":
		return TokenBT
	case "rl":
		return TokenRL
	case "lr":
		return TokenLR
	default:
		return TokenID
	}
}
// consumeNumber consumes numeric literals.
func (l *Lexer) consumeNumber() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsDigit(l.current()) {
		l.advance()
	}
	// Handle a decimal point.
	if l.position < len(l.input) && l.current() == '.' {
		l.advance()
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
	}
	l.addToken(TokenNumber, l.input[start:l.position])
	return nil
}
// consumeString consumes double-quoted string literals; the token value keeps
// the surrounding quotes.
func (l *Lexer) consumeString() error {
	start := l.position
	// Skip the opening quote.
	l.advance()
	for l.position < len(l.input) && l.current() != '"' {
		if l.current() == '\\' && l.position+1 < len(l.input) {
			// Skip the escaped character.
			l.advance()
			l.advance()
		} else {
			l.advance()
		}
	}
	if l.position >= len(l.input) {
		return fmt.Errorf("unterminated string at line %d, column %d", l.line, l.column)
	}
	// Skip the closing quote.
	l.advance()
	l.addToken(TokenString, l.input[start:l.position])
	return nil
}
// consumeSingleChar handles single-character tokens.
func (l *Lexer) consumeSingleChar() error {
	ch := l.current()
	var tokenType TokenType
	switch ch {
	case '[':
		tokenType = TokenOpenBracket
	case ']':
		tokenType = TokenCloseBracket
	case '(':
		tokenType = TokenOpenParen
	case ')':
		tokenType = TokenCloseParen
	case '{':
		tokenType = TokenOpenBrace
	case '}':
		tokenType = TokenCloseBrace
	case '<':
		tokenType = TokenOpenAngle
	case '>':
		tokenType = TokenCloseAngle
	case '|':
		tokenType = TokenPipe
	case ':':
		tokenType = TokenColon
	case ';':
		tokenType = TokenSemicolon
	case ',':
		tokenType = TokenComma
	case '&':
		tokenType = TokenAmpersand
	case '*':
		tokenType = TokenMult
	case '+':
		tokenType = TokenPlus
	case '-':
		tokenType = TokenMinus
	case '=':
		tokenType = TokenEquals
	case '.':
		tokenType = TokenDot
	case '!':
		tokenType = TokenExclamation
	default:
		return fmt.Errorf("unexpected character '%c' at line %d, column %d", ch, l.line, l.column)
	}
	l.addTokenAndAdvance(tokenType, string(ch), 1)
	return nil
}
// Helper methods

// current returns the current character, or 0 at end of input. Note that the
// lexer reads single bytes, so multi-byte UTF-8 characters are not decoded here.
func (l *Lexer) current() rune {
	if l.position >= len(l.input) {
		return 0
	}
	return rune(l.input[l.position])
}

// peek returns the next character without advancing, or 0 at end of input.
func (l *Lexer) peek() rune {
	if l.position+1 >= len(l.input) {
		return 0
	}
	return rune(l.input[l.position+1])
}
// advance moves to the next character, updating the line and column counters.
func (l *Lexer) advance() {
	if l.position < len(l.input) {
		if l.input[l.position] == '\n' {
			l.line++
			l.column = 1
		} else {
			l.column++
		}
		l.position++
	}
}
// matchString checks if the input matches the given string at the current position.
func (l *Lexer) matchString(s string) bool {
	if l.position+len(s) > len(l.input) {
		return false
	}
	return l.input[l.position:l.position+len(s)] == s
}
// addToken adds a token to the token list. It assumes the token's text has just
// been consumed, so the start column and position are recovered by subtracting
// the value's length.
func (l *Lexer) addToken(tokenType TokenType, value string) {
	token := Token{
		Type:     tokenType,
		Value:    value,
		Line:     l.line,
		Column:   l.column - len(value),
		Position: l.position - len(value),
	}
	l.tokens = append(l.tokens, token)
}
// addTokenAndAdvance consumes length characters and then records the token, so
// that addToken's "subtract the length" arithmetic yields the token's start.
func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, length int) {
	for i := 0; i < length; i++ {
		l.advance()
	}
	l.addToken(tokenType, value)
}
// FilterTokens removes whitespace and comment tokens for parsing.
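// For the "A --> B" stream above, filtering yields ID("A"), ARROW_SOLID, ID("B"), EOF.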
func FilterTokens(tokens []Token) []Token {
	filtered := make([]Token, 0, len(tokens))
	for _, token := range tokens {
		if token.Type != TokenSpace && token.Type != TokenComment {
			filtered = append(filtered, token)
		}
	}
	return filtered
}