// Package lexer provides lexical analysis for Mermaid diagram syntax.
// Based on the lexical rules from flow.jison in mermaid.js
package lexer

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// TokenType represents the type of a lexical token
type TokenType int

const (
	// Special tokens
	TokenEOF TokenType = iota
	TokenNewline
	TokenSpace
	TokenComment

	// Keywords - from flow.jison
	TokenGraph
	TokenSubgraph
	TokenEnd
	TokenDirection
	TokenClass
	TokenClassDef
	TokenClick
	TokenStyle
	TokenLinkStyle
	TokenDefault

	// Directions
	TokenTD // Top Down
	TokenTB // Top Bottom
	TokenBT // Bottom Top
	TokenRL // Right Left
	TokenLR // Left Right

	// State diagram action keywords
	TokenEntry // entry
	TokenExit  // exit
	TokenDo    // do

	// Identifiers and literals
	TokenID
	TokenString
	TokenNodeString
	TokenNumber
	TokenUnicodeText

	// Shape delimiters - following JISON patterns
	TokenOpenBracket      // [
	TokenCloseBracket     // ]
	TokenOpenParen        // (
	TokenCloseParen       // )
	TokenOpenBrace        // {
	TokenCloseBrace       // }
	TokenOpenDoubleParen  // ((
	TokenCloseDoubleParen // ))
	TokenOpenAngle        // <
	TokenCloseAngle       // >

	// Edge tokens - from destructLink logic in flowDb.ts
	TokenArrowSolid  // -->
	TokenArrowDotted // -.->
	TokenArrowThick  // ==>
	TokenArrowOpen   // ---
	TokenArrowPoint  // -->
	TokenArrowCross  // --x
	TokenArrowCircle // --o

	// ER diagram relationship tokens
	TokenEROneToMany     // ||--o{
	TokenEROneToManyAlt  // ||--|{
	TokenERManyToOne     // }o--||
	TokenEROneToOne      // ||--||
	TokenERManyToMany    // }o--o{
	TokenERManyToManyAlt // }|..|{
	TokenERZeroToOne     // ||--o|

	// Class diagram relationship tokens
	TokenClassInheritance // <|--
	TokenClassComposition // *--
	TokenClassAggregation // o--
	TokenClassAssociation // -->
	TokenClassRealization // ..|>
	TokenClassDependency  // ..>

	// ER diagram cardinality tokens
	TokenERCardOnlyOne    // ||
	TokenERCardZeroOrOne  // o
	TokenERCardZeroOrMore // o{
	TokenERCardOneOrMore  // |{
	TokenERCardCloseOne   // }|
	TokenERCardCloseZero  // }o

	// Edge modifiers and punctuation
	TokenPipe        // |
	TokenColon       // :
	TokenSemicolon   // ;
	TokenComma       // ,
	TokenAmpersand   // &
	TokenMult        // *
	TokenPlus        // +
	TokenMinus       // -
	TokenEquals      // =
	TokenDot         // .
	TokenExclamation // !
	TokenSlash       // /
	TokenBackslash   // \
	TokenHash        // #
	TokenAt          // @
	TokenPercent     // %
	TokenTilde       // ~
	TokenQuestion    // ?
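
	// Example: the ER relationship line "CUSTOMER ||--o{ ORDER : places"
	// lexes to ID, ER_ONE_TO_MANY, ID, COLON, ID, with SPACE tokens in
	// between.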
	// Error token
	TokenError
)

// Token represents a lexical token
type Token struct {
	Type     TokenType
	Value    string
	Line     int
	Column   int
	Position int
}

// String returns a string representation of the token
func (t Token) String() string {
	return fmt.Sprintf("Token{Type: %s, Value: %q, Line: %d, Col: %d}",
		t.Type.String(), t.Value, t.Line, t.Column)
}

// String returns the string representation of TokenType
func (tt TokenType) String() string {
	if name, exists := tokenTypeNames[tt]; exists {
		return name
	}
	return fmt.Sprintf("TokenType(%d)", int(tt))
}

var tokenTypeNames = map[TokenType]string{
	TokenEOF:              "EOF",
	TokenNewline:          "NEWLINE",
	TokenSpace:            "SPACE",
	TokenComment:          "COMMENT",
	TokenGraph:            "GRAPH",
	TokenSubgraph:         "SUBGRAPH",
	TokenEnd:              "END",
	TokenDirection:        "DIRECTION",
	TokenClass:            "CLASS",
	TokenClassDef:         "CLASSDEF",
	TokenClick:            "CLICK",
	TokenStyle:            "STYLE",
	TokenLinkStyle:        "LINKSTYLE",
	TokenDefault:          "DEFAULT",
	TokenTD:               "TD",
	TokenTB:               "TB",
	TokenBT:               "BT",
	TokenRL:               "RL",
	TokenLR:               "LR",
	TokenEntry:            "ENTRY",
	TokenExit:             "EXIT",
	TokenDo:               "DO",
	TokenID:               "ID",
	TokenString:           "STRING",
	TokenNodeString:       "NODE_STRING",
	TokenNumber:           "NUMBER",
	TokenUnicodeText:      "UNICODE_TEXT",
	TokenOpenBracket:      "OPEN_BRACKET",
	TokenCloseBracket:     "CLOSE_BRACKET",
	TokenOpenParen:        "OPEN_PAREN",
	TokenCloseParen:       "CLOSE_PAREN",
	TokenOpenBrace:        "OPEN_BRACE",
	TokenCloseBrace:       "CLOSE_BRACE",
	TokenOpenDoubleParen:  "OPEN_DOUBLE_PAREN",
	TokenCloseDoubleParen: "CLOSE_DOUBLE_PAREN",
	TokenOpenAngle:        "OPEN_ANGLE",
	TokenCloseAngle:       "CLOSE_ANGLE",
	TokenArrowSolid:       "ARROW_SOLID",
	TokenArrowDotted:      "ARROW_DOTTED",
	TokenArrowThick:       "ARROW_THICK",
	TokenArrowOpen:        "ARROW_OPEN",
	TokenArrowPoint:       "ARROW_POINT",
	TokenArrowCross:       "ARROW_CROSS",
	TokenArrowCircle:      "ARROW_CIRCLE",
	TokenEROneToMany:      "ER_ONE_TO_MANY",
	TokenEROneToManyAlt:   "ER_ONE_TO_MANY_ALT",
	TokenERManyToOne:      "ER_MANY_TO_ONE",
	TokenEROneToOne:       "ER_ONE_TO_ONE",
	TokenERManyToMany:     "ER_MANY_TO_MANY",
	TokenERManyToManyAlt:  "ER_MANY_TO_MANY_ALT",
	TokenERZeroToOne:      "ER_ZERO_TO_ONE",
	TokenClassInheritance: "CLASS_INHERITANCE",
	TokenClassComposition: "CLASS_COMPOSITION",
	TokenClassAggregation: "CLASS_AGGREGATION",
	TokenClassAssociation: "CLASS_ASSOCIATION",
	TokenClassRealization: "CLASS_REALIZATION",
	TokenClassDependency:  "CLASS_DEPENDENCY",
	TokenERCardOnlyOne:    "ER_CARD_ONLY_ONE",
	TokenERCardZeroOrOne:  "ER_CARD_ZERO_OR_ONE",
	TokenERCardZeroOrMore: "ER_CARD_ZERO_OR_MORE",
	TokenERCardOneOrMore:  "ER_CARD_ONE_OR_MORE",
	TokenERCardCloseOne:   "ER_CARD_CLOSE_ONE",
	TokenERCardCloseZero:  "ER_CARD_CLOSE_ZERO",
	TokenPipe:             "PIPE",
	TokenColon:            "COLON",
	TokenSemicolon:        "SEMICOLON",
	TokenComma:            "COMMA",
	TokenAmpersand:        "AMPERSAND",
	TokenMult:             "MULT",
	TokenPlus:             "PLUS",
	TokenMinus:            "MINUS",
	TokenEquals:           "EQUALS",
	TokenDot:              "DOT",
	TokenExclamation:      "EXCLAMATION",
	TokenSlash:            "SLASH",
	TokenBackslash:        "BACKSLASH",
	TokenHash:             "HASH",
	TokenAt:               "AT",
	TokenPercent:          "PERCENT",
	TokenTilde:            "TILDE",
	TokenQuestion:         "QUESTION",
	TokenError:            "ERROR",
}

// Lexer performs lexical analysis on mermaid input
type Lexer struct {
	input    string
	position int
	line     int
	column   int
	tokens   []Token
}

// NewLexer creates a new lexer for the given input
func NewLexer(input string) *Lexer {
	return &Lexer{
		input:  input,
		line:   1,
		column: 1,
		tokens: make([]Token, 0),
	}
}

// Tokenize performs lexical analysis and returns all tokens
func (l *Lexer) Tokenize() ([]Token, error) {
	for l.position < len(l.input) {
		if err := l.nextToken(); err != nil {
			return nil, err
		}
	}
	// Add EOF token
	l.addToken(TokenEOF, "")
	return l.tokens, nil
}
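
// Example usage:
//
//	lex := NewLexer("graph TD")
//	tokens, err := lex.Tokenize()
//	if err != nil {
//		// handle the lexing error
//	}
//	// tokens: GRAPH("graph"), SPACE(" "), TD("TD"), EOF
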
// nextToken processes the next token from input
func (l *Lexer) nextToken() error {
	if l.position >= len(l.input) {
		return nil
	}

	ch := l.current()

	// Skip whitespace but track newlines
	if unicode.IsSpace(ch) {
		return l.consumeWhitespace()
	}

	// Comments - following mermaid.js pattern
	if ch == '%' && l.peek() == '%' {
		return l.consumeComment()
	}

	// Multi-character operators first (order matters!)
	if multiChar := l.tryMultiCharOperator(); multiChar != TokenError {
		return nil
	}

	// Keywords and identifiers - support Unicode letters including Chinese
	if l.isUnicodeIdentifierStart(ch) {
		return l.consumeIdentifier()
	}

	// Numbers
	if unicode.IsDigit(ch) {
		return l.consumeNumber()
	}

	// Strings
	if ch == '"' {
		return l.consumeString()
	}

	// Single character tokens
	return l.consumeSingleChar()
}

// tryMultiCharOperator attempts to match multi-character operators. It
// returns the matched token type, or TokenError when nothing matched.
func (l *Lexer) tryMultiCharOperator() TokenType {
	// Check for state diagram stereotype markers first
	if l.matchString("<<fork>>") {
		l.addTokenAndAdvance(TokenID, "<<fork>>", 8)
		return TokenID
	}
	if l.matchString("<<join>>") {
		l.addTokenAndAdvance(TokenID, "<<join>>", 8)
		return TokenID
	}
	if l.matchString("<<choice>>") {
		l.addTokenAndAdvance(TokenID, "<<choice>>", 10)
		return TokenID
	}
	if l.matchString("<<history>>") {
		l.addTokenAndAdvance(TokenID, "<<history>>", 11)
		return TokenID
	}
	if l.matchString("<<deepHistory>>") {
		l.addTokenAndAdvance(TokenID, "<<deepHistory>>", 15)
		return TokenID
	}

	// Check for ER diagram relationships first (these must come before the
	// shorter cardinality patterns below)
	if l.matchString("||--o{") {
		l.addTokenAndAdvance(TokenEROneToMany, "||--o{", 6)
		return TokenEROneToMany
	}
	if l.matchString("}o--||") {
		l.addTokenAndAdvance(TokenERManyToOne, "}o--||", 6)
		return TokenERManyToOne
	}
	if l.matchString("||--||") {
		l.addTokenAndAdvance(TokenEROneToOne, "||--||", 6)
		return TokenEROneToOne
	}
	if l.matchString("}o--o{") {
		l.addTokenAndAdvance(TokenERManyToMany, "}o--o{", 6)
		return TokenERManyToMany
	}
	if l.matchString("||--o|") {
		l.addTokenAndAdvance(TokenERZeroToOne, "||--o|", 6)
		return TokenERZeroToOne
	}
	if l.matchString("||--|{") {
		l.addTokenAndAdvance(TokenEROneToManyAlt, "||--|{", 6)
		return TokenEROneToManyAlt
	}
	if l.matchString("}|..|{") {
		l.addTokenAndAdvance(TokenERManyToManyAlt, "}|..|{", 6)
		return TokenERManyToManyAlt
	}

	// Check for class diagram relationship symbols
	if l.matchString("<|--") {
		l.addTokenAndAdvance(TokenClassInheritance, "<|--", 4)
		return TokenClassInheritance
	}
	if l.matchString("*--") {
		l.addTokenAndAdvance(TokenClassComposition, "*--", 3)
		return TokenClassComposition
	}
	if l.matchString("o--") {
		l.addTokenAndAdvance(TokenClassAggregation, "o--", 3)
		return TokenClassAggregation
	}
	// The sequence arrow "-->>" shares the "-->" prefix, so it must be
	// tried before the class association arrow
	if l.matchString("-->>") {
		l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4)
		return TokenArrowDotted
	}
	if l.matchString("-->") {
		l.addTokenAndAdvance(TokenClassAssociation, "-->", 3)
		return TokenClassAssociation
	}
	if l.matchString("..|>") {
		l.addTokenAndAdvance(TokenClassRealization, "..|>", 4)
		return TokenClassRealization
	}
	if l.matchString("..>") {
		l.addTokenAndAdvance(TokenClassDependency, "..>", 3)
		return TokenClassDependency
	}

	// Check for ER diagram cardinality symbols
	if l.matchString("||") {
		l.addTokenAndAdvance(TokenERCardOnlyOne, "||", 2)
		return TokenERCardOnlyOne
	}
	if l.matchString("o{") {
		l.addTokenAndAdvance(TokenERCardZeroOrMore, "o{", 2)
		return TokenERCardZeroOrMore
	}
	if l.matchString("|{") {
		l.addTokenAndAdvance(TokenERCardOneOrMore, "|{", 2)
		return TokenERCardOneOrMore
	}
	if l.matchString("}|") {
		l.addTokenAndAdvance(TokenERCardCloseOne, "}|", 2)
		return TokenERCardCloseOne
	}
	if l.matchString("}o") {
		l.addTokenAndAdvance(TokenERCardCloseZero, "}o", 2)
		return TokenERCardCloseZero
	}
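
	// Longest-match ordering matters throughout this function: for
	// example, "||--o{" above must be tried before the bare "||"
	// cardinality, otherwise "CUSTOMER ||--o{ ORDER" would lex as
	// ER_CARD_ONLY_ONE followed by stray punctuation.
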
l.matchString("stateDiagram-v2") { l.addTokenAndAdvance(TokenID, "stateDiagram-v2", 15) return TokenID } if l.matchString("stateDiagram") { l.addTokenAndAdvance(TokenID, "stateDiagram", 12) return TokenID } if l.matchString("sequenceDiagram") { l.addTokenAndAdvance(TokenID, "sequenceDiagram", 15) return TokenID } if l.matchString("erDiagram") { l.addTokenAndAdvance(TokenID, "erDiagram", 9) return TokenID } // Check for sequence diagram arrows (longer patterns first, following mermaidjs patterns) // Bidirectional arrows if l.matchString("<<-->>") { l.addTokenAndAdvance(TokenArrowDotted, "<<-->>", 6) return TokenArrowDotted } if l.matchString("<<->>") { l.addTokenAndAdvance(TokenArrowSolid, "<<->>", 5) return TokenArrowSolid } // Complex arrows with directional markers if l.matchString("--|\\") { l.addTokenAndAdvance(TokenArrowSolid, "--|\\", 4) return TokenArrowSolid } if l.matchString("--|/") { l.addTokenAndAdvance(TokenArrowSolid, "--|/", 4) return TokenArrowSolid } if l.matchString("-|\\") { l.addTokenAndAdvance(TokenArrowSolid, "-|\\", 3) return TokenArrowSolid } if l.matchString("-|/") { l.addTokenAndAdvance(TokenArrowSolid, "-|/", 3) return TokenArrowSolid } // Standard arrows if l.matchString("-->>") { l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4) return TokenArrowDotted } if l.matchString("->>") { l.addTokenAndAdvance(TokenArrowSolid, "->>", 3) return TokenArrowSolid } if l.matchString("<->") { l.addTokenAndAdvance(TokenArrowSolid, "<->", 3) return TokenArrowSolid } if l.matchString("-x") { l.addTokenAndAdvance(TokenArrowCross, "-x", 2) return TokenArrowCross } if l.matchString("--)") { l.addTokenAndAdvance(TokenArrowOpen, "--)", 3) return TokenArrowOpen } if l.matchString("->") { l.addTokenAndAdvance(TokenArrowSolid, "->", 2) return TokenArrowSolid } if l.matchString("-)") { l.addTokenAndAdvance(TokenArrowOpen, "-)", 2) return TokenArrowOpen } // Check for arrows - based on destructLink patterns if l.matchString("==>") { l.addTokenAndAdvance(TokenArrowThick, "==>", 3) return TokenArrowThick } if l.matchString("-->") { l.addTokenAndAdvance(TokenArrowSolid, "-->", 3) return TokenArrowSolid } if l.matchString("-.->") { l.addTokenAndAdvance(TokenArrowDotted, "-.->", 4) return TokenArrowDotted } if l.matchString("--x") { l.addTokenAndAdvance(TokenArrowCross, "--x", 3) return TokenArrowCross } if l.matchString("--o") { l.addTokenAndAdvance(TokenArrowCircle, "--o", 3) return TokenArrowCircle } if l.matchString("---") { l.addTokenAndAdvance(TokenArrowOpen, "---", 3) return TokenArrowOpen } if l.matchString("((") { l.addTokenAndAdvance(TokenOpenDoubleParen, "((", 2) return TokenOpenDoubleParen } if l.matchString("))") { l.addTokenAndAdvance(TokenCloseDoubleParen, "))", 2) return TokenCloseDoubleParen } // Check for single character ER cardinality symbols if l.matchString("o") && !l.isNextChar('{') && !l.isNextChar('-') && !l.isNextCharLetter() { l.addTokenAndAdvance(TokenERCardZeroOrOne, "o", 1) return TokenERCardZeroOrOne } return TokenError } // consumeWhitespace consumes whitespace characters func (l *Lexer) consumeWhitespace() error { start := l.position for l.position < len(l.input) && unicode.IsSpace(l.current()) { if l.current() == '\n' { l.line++ l.column = 1 l.position++ // Add newline token for significant newlines if start < l.position-1 { l.addToken(TokenSpace, l.input[start:l.position-1]) } l.addToken(TokenNewline, "\n") return nil } else { l.advance() } } if start < l.position { l.addToken(TokenSpace, l.input[start:l.position]) } return nil } // consumeComment consumes a 
// consumeComment consumes a %% comment line
func (l *Lexer) consumeComment() error {
	start := l.position
	// Skip %%
	l.advance()
	l.advance()
	// Read until end of line
	for l.position < len(l.input) && l.current() != '\n' {
		l.advance()
	}
	l.addToken(TokenComment, l.input[start:l.position])
	return nil
}

// consumeIdentifier consumes identifiers and keywords with Unicode support
func (l *Lexer) consumeIdentifier() error {
	start := l.position
	// First character already validated
	l.advance()
	// Continue with Unicode identifier characters
	for l.position < len(l.input) {
		ch := l.current()
		if l.isUnicodeIdentifierChar(ch) {
			l.advance()
		} else {
			break
		}
	}
	value := l.input[start:l.position]
	tokenType := l.getKeywordType(value)
	l.addToken(tokenType, value)
	return nil
}

// getKeywordType returns the token type for keywords, or TokenID for
// plain identifiers
func (l *Lexer) getKeywordType(value string) TokenType {
	// Keywords from flow.jison
	switch strings.ToLower(value) {
	case "graph":
		return TokenGraph
	case "flowchart":
		return TokenGraph // flowchart uses the same token as graph
	case "subgraph":
		return TokenSubgraph
	case "end":
		return TokenEnd
	case "class":
		return TokenClass
	case "classdef":
		return TokenClassDef
	case "click":
		return TokenClick
	case "style":
		return TokenStyle
	case "linkstyle":
		return TokenLinkStyle
	case "default":
		return TokenDefault

	// Direction keywords
	case "td":
		return TokenTD
	case "tb":
		return TokenTB
	case "bt":
		return TokenBT
	case "rl":
		return TokenRL
	case "lr":
		return TokenLR

	// State diagram keywords
	case "state":
		return TokenID
	case "diagram":
		return TokenID
	case "statediagram-v2":
		return TokenID
	case "statediagram":
		return TokenID
	case "entry":
		return TokenEntry
	case "exit":
		return TokenExit
	case "do":
		return TokenDo

	// ER attribute key markers stay plain identifiers
	case "pk":
		return TokenID
	case "fk":
		return TokenID
	case "uk":
		return TokenID
	default:
		return TokenID
	}
}

// consumeNumber consumes numeric literals
func (l *Lexer) consumeNumber() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsDigit(l.current()) {
		l.advance()
	}
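
	// Beyond the integer part, fold in a fractional part ("2.5"),
	// gantt-style duration suffixes ("3d", "2w"), and dashed dates
	// ("2024-01-15") so each lexes as a single NUMBER token.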
	// Handle decimal point
	if l.position < len(l.input) && l.current() == '.' {
		l.advance()
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
	}

	// Check for duration suffixes (d, w, m, y, h)
	if l.position < len(l.input) {
		ch := l.current()
		if ch == 'd' || ch == 'w' || ch == 'm' || ch == 'y' || ch == 'h' {
			l.advance()
		}
	}

	// Check for date format (YYYY-MM-DD) - look for additional numbers
	// separated by dashes
	if l.position < len(l.input) && l.current() == '-' {
		l.advance() // consume the dash
		// Look for more digits after the dash
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
		// Look for another dash and more digits
		if l.position < len(l.input) && l.current() == '-' {
			l.advance() // consume the second dash
			for l.position < len(l.input) && unicode.IsDigit(l.current()) {
				l.advance()
			}
		}
	}

	l.addToken(TokenNumber, l.input[start:l.position])
	return nil
}

// consumeString consumes double-quoted string literals
func (l *Lexer) consumeString() error {
	start := l.position
	// Skip opening quote
	l.advance()
	for l.position < len(l.input) && l.current() != '"' {
		if l.current() == '\\' && l.position+1 < len(l.input) {
			// Skip escaped character
			l.advance()
			l.advance()
		} else {
			l.advance()
		}
	}
	if l.position >= len(l.input) {
		return fmt.Errorf("unterminated string at line %d, column %d", l.line, l.column)
	}
	// Skip closing quote
	l.advance()
	l.addToken(TokenString, l.input[start:l.position])
	return nil
}

// consumeSingleChar handles single character tokens
func (l *Lexer) consumeSingleChar() error {
	ch := l.current()
	var tokenType TokenType

	switch ch {
	case '[':
		tokenType = TokenOpenBracket
	case ']':
		tokenType = TokenCloseBracket
	case '(':
		tokenType = TokenOpenParen
	case ')':
		tokenType = TokenCloseParen
	case '{':
		tokenType = TokenOpenBrace
	case '}':
		tokenType = TokenCloseBrace
	case '<':
		tokenType = TokenOpenAngle
	case '>':
		tokenType = TokenCloseAngle
	case '|':
		tokenType = TokenPipe
	case ':':
		tokenType = TokenColon
	case ';':
		tokenType = TokenSemicolon
	case ',':
		tokenType = TokenComma
	case '&':
		tokenType = TokenAmpersand
	case '*':
		tokenType = TokenMult
	case '+':
		tokenType = TokenPlus
	case '-':
		tokenType = TokenMinus
	case '=':
		tokenType = TokenEquals
	case '.':
		tokenType = TokenDot
	case '!':
		tokenType = TokenExclamation
	case '/':
		tokenType = TokenSlash
	case '\\':
		tokenType = TokenBackslash
	case '#':
		tokenType = TokenHash
	case '@':
		tokenType = TokenAt
	case '%':
		tokenType = TokenPercent
	case '~':
		tokenType = TokenTilde
	case '?':
		tokenType = TokenQuestion
	default:
		return fmt.Errorf("unexpected character '%c' at line %d, column %d", ch, l.line, l.column)
	}

	l.addTokenAndAdvance(tokenType, string(ch), 1)
	return nil
}

// Helper methods

// current returns the rune at the current position
func (l *Lexer) current() rune {
	if l.position >= len(l.input) {
		return 0
	}
	r, _ := utf8.DecodeRuneInString(l.input[l.position:])
	return r
}

// peek returns the rune after the current one without advancing
func (l *Lexer) peek() rune {
	if l.position >= len(l.input) {
		return 0
	}
	_, size := utf8.DecodeRuneInString(l.input[l.position:])
	if l.position+size >= len(l.input) {
		return 0
	}
	nextR, _ := utf8.DecodeRuneInString(l.input[l.position+size:])
	return nextR
}

// advance moves to the next rune, updating line and column
func (l *Lexer) advance() {
	if l.position < len(l.input) {
		r, size := utf8.DecodeRuneInString(l.input[l.position:])
		if r == '\n' {
			l.line++
			l.column = 1
		} else {
			l.column++
		}
		l.position += size
	}
}
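
// Rune-awareness example: for the input "中A", current() decodes '中',
// advance() moves position forward by the rune's 3-byte UTF-8 width while
// column grows by only 1, and peek() returns 'A'.
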
// matchString checks if the input matches the given string at the
// current position
func (l *Lexer) matchString(s string) bool {
	if l.position+len(s) > len(l.input) {
		return false
	}
	return l.input[l.position:l.position+len(s)] == s
}

// isNextChar checks if the byte after the current position matches ch.
// Callers only use this after a single-byte (ASCII) current character.
func (l *Lexer) isNextChar(ch byte) bool {
	if l.position+1 >= len(l.input) {
		return false
	}
	return l.input[l.position+1] == ch
}

// isNextCharLetter checks if the byte after the current position is a
// letter; it reads a single byte, so like isNextChar it is only reliable
// for ASCII input.
func (l *Lexer) isNextCharLetter() bool {
	if l.position+1 >= len(l.input) {
		return false
	}
	return unicode.IsLetter(rune(l.input[l.position+1]))
}

// addToken appends a token whose text has just been consumed; it derives
// the token's start column and byte position by subtracting the token's
// width from the lexer's current position (so the value must not span a
// newline).
func (l *Lexer) addToken(tokenType TokenType, value string) {
	runeCount := utf8.RuneCountInString(value)
	byteCount := len(value)
	token := Token{
		Type:     tokenType,
		Value:    value,
		Line:     l.line,
		Column:   l.column - runeCount,
		Position: l.position - byteCount,
	}
	l.tokens = append(l.tokens, token)
}

// addTokenAndAdvance advances past the token (runeCount runes) and then
// records it; advancing first keeps addToken's subtract-the-width
// arithmetic for Column and Position correct.
func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, runeCount int) {
	for i := 0; i < runeCount; i++ {
		l.advance()
	}
	l.addToken(tokenType, value)
}

// isChineseChar checks if a character is a CJK character. Despite the
// name, the ranges also cover Japanese kana, Hangul, and fullwidth forms,
// so identifiers in those scripts work as well.
func (l *Lexer) isChineseChar(ch rune) bool {
	return (ch >= 0x4e00 && ch <= 0x9fff) || // CJK Unified Ideographs
		(ch >= 0x3400 && ch <= 0x4dbf) || // CJK Extension A
		(ch >= 0x20000 && ch <= 0x2a6df) || // CJK Extension B
		(ch >= 0x2a700 && ch <= 0x2b73f) || // CJK Extension C
		(ch >= 0x2b740 && ch <= 0x2b81f) || // CJK Extension D
		(ch >= 0x2b820 && ch <= 0x2ceaf) || // CJK Extension E
		(ch >= 0x2ceb0 && ch <= 0x2ebef) || // CJK Extension F
		(ch >= 0x30000 && ch <= 0x3134f) || // CJK Extension G
		(ch >= 0x3190 && ch <= 0x319f) || // Kanbun
		(ch >= 0x31c0 && ch <= 0x31ef) || // CJK Strokes
		(ch >= 0x2e80 && ch <= 0x2eff) || // CJK Radicals Supplement
		(ch >= 0x2f00 && ch <= 0x2fdf) || // Kangxi Radicals
		(ch >= 0x2ff0 && ch <= 0x2fff) || // Ideographic Description Characters
		(ch >= 0x3000 && ch <= 0x303f) || // CJK Symbols and Punctuation
		(ch >= 0x3040 && ch <= 0x309f) || // Hiragana
		(ch >= 0x30a0 && ch <= 0x30ff) || // Katakana
		(ch >= 0x3100 && ch <= 0x312f) || // Bopomofo
		(ch >= 0x3130 && ch <= 0x318f) || // Hangul Compatibility Jamo
		(ch >= 0x31a0 && ch <= 0x31bf) || // Bopomofo Extended
		(ch >= 0xac00 && ch <= 0xd7af) || // Hangul Syllables
		(ch >= 0xff00 && ch <= 0xffef) // Halfwidth and Fullwidth Forms
}

// isUnicodeIdentifierStart checks if a character can start a Unicode identifier
func (l *Lexer) isUnicodeIdentifierStart(ch rune) bool {
	return unicode.IsLetter(ch) || ch == '_' || l.isChineseChar(ch)
}

// isUnicodeIdentifierChar checks if a character can be part of a Unicode identifier
func (l *Lexer) isUnicodeIdentifierChar(ch rune) bool {
	return unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' || ch == '-' || l.isChineseChar(ch)
}

// FilterTokens removes whitespace and comment tokens for parsing
func FilterTokens(tokens []Token) []Token {
	filtered := make([]Token, 0, len(tokens))
	for _, token := range tokens {
		if token.Type != TokenSpace && token.Type != TokenComment {
			filtered = append(filtered, token)
		}
	}
	return filtered
}
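
// Example: FilterTokens keeps NEWLINE (a statement separator) but drops
// SPACE and COMMENT, so a parser can walk the slice without skipping
// trivia.
//
//	tokens, err := NewLexer("erDiagram").Tokenize()
//	if err != nil {
//		// report the error with its line/column information
//	}
//	parseable := FilterTokens(tokens)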