// Package lexer provides lexical analysis for Mermaid diagram syntax.
// Based on the lexical rules from flow.jison in mermaid.js
package lexer

import (
	"fmt"
	"strings"
	"unicode"
)

// TokenType represents the type of a lexical token
type TokenType int

const (
	// Special tokens
	TokenEOF TokenType = iota
	TokenNewline
	TokenSpace
	TokenComment

	// Keywords - from flow.jison
	TokenGraph
	TokenSubgraph
	TokenEnd
	TokenDirection
	TokenClass
	TokenClassDef
	TokenClick
	TokenStyle
	TokenLinkStyle
	TokenDefault

	// Directions
	TokenTD // Top Down
	TokenTB // Top Bottom
	TokenBT // Bottom Top
	TokenRL // Right Left
	TokenLR // Left Right

	// Identifiers and literals
	TokenID
	TokenString
	TokenNodeString
	TokenNumber
	TokenUnicodeText

	// Shape delimiters - following JISON patterns
	TokenOpenBracket      // [
	TokenCloseBracket     // ]
	TokenOpenParen        // (
	TokenCloseParen       // )
	TokenOpenBrace        // {
	TokenCloseBrace       // }
	TokenOpenDoubleParen  // ((
	TokenCloseDoubleParen // ))
	TokenOpenAngle        // <
	TokenCloseAngle       // >

	// Edge tokens - from destructLink logic in flowDb.ts
	TokenArrowSolid  // -->
	TokenArrowDotted // -.->
	TokenArrowThick  // ==>
	TokenArrowOpen   // ---
	TokenArrowPoint  // -->
	TokenArrowCross  // --x
	TokenArrowCircle // --o

	// ER diagram relationship tokens
	TokenEROneToMany  // ||--o{
	TokenERManyToOne  // }o--||
	TokenEROneToOne   // ||--||
	TokenERManyToMany // }o--o{
	TokenERZeroToOne  // ||--o|

	// Edge modifiers
	TokenPipe        // |
	TokenColon       // :
	TokenSemicolon   // ;
	TokenComma       // ,
	TokenAmpersand   // &
	TokenMult        // *
	TokenPlus        // +
	TokenMinus       // -
	TokenEquals      // =
	TokenDot         // .
	TokenExclamation // !

	// Error token
	TokenError
)

// Token represents a lexical token
type Token struct {
	Type     TokenType
	Value    string
	Line     int
	Column   int
	Position int
}

// String returns a string representation of the token
func (t Token) String() string {
	return fmt.Sprintf("Token{Type: %s, Value: %q, Line: %d, Col: %d}",
		tokenTypeNames[t.Type], t.Value, t.Line, t.Column)
}

var tokenTypeNames = map[TokenType]string{
	TokenEOF:              "EOF",
	TokenNewline:          "NEWLINE",
	TokenSpace:            "SPACE",
	TokenComment:          "COMMENT",
	TokenGraph:            "GRAPH",
	TokenSubgraph:         "SUBGRAPH",
	TokenEnd:              "END",
	TokenDirection:        "DIRECTION",
	TokenClass:            "CLASS",
	TokenClassDef:         "CLASSDEF",
	TokenClick:            "CLICK",
	TokenStyle:            "STYLE",
	TokenLinkStyle:        "LINKSTYLE",
	TokenDefault:          "DEFAULT",
	TokenTD:               "TD",
	TokenTB:               "TB",
	TokenBT:               "BT",
	TokenRL:               "RL",
	TokenLR:               "LR",
	TokenID:               "ID",
	TokenString:           "STRING",
	TokenNodeString:       "NODE_STRING",
	TokenNumber:           "NUMBER",
	TokenUnicodeText:      "UNICODE_TEXT",
	TokenOpenBracket:      "OPEN_BRACKET",
	TokenCloseBracket:     "CLOSE_BRACKET",
	TokenOpenParen:        "OPEN_PAREN",
	TokenCloseParen:       "CLOSE_PAREN",
	TokenOpenBrace:        "OPEN_BRACE",
	TokenCloseBrace:       "CLOSE_BRACE",
	TokenOpenDoubleParen:  "OPEN_DOUBLE_PAREN",
	TokenCloseDoubleParen: "CLOSE_DOUBLE_PAREN",
	TokenOpenAngle:        "OPEN_ANGLE",
	TokenCloseAngle:       "CLOSE_ANGLE",
	TokenArrowSolid:       "ARROW_SOLID",
	TokenArrowDotted:      "ARROW_DOTTED",
	TokenArrowThick:       "ARROW_THICK",
	TokenArrowOpen:        "ARROW_OPEN",
	TokenArrowPoint:       "ARROW_POINT",
	TokenArrowCross:       "ARROW_CROSS",
	TokenArrowCircle:      "ARROW_CIRCLE",
	TokenEROneToMany:      "ER_ONE_TO_MANY",
	TokenERManyToOne:      "ER_MANY_TO_ONE",
	TokenEROneToOne:       "ER_ONE_TO_ONE",
	TokenERManyToMany:     "ER_MANY_TO_MANY",
	TokenERZeroToOne:      "ER_ZERO_TO_ONE",
	TokenPipe:             "PIPE",
	TokenColon:            "COLON",
	TokenSemicolon:        "SEMICOLON",
	TokenComma:            "COMMA",
	TokenAmpersand:        "AMPERSAND",
	TokenMult:             "MULT",
	TokenPlus:             "PLUS",
	TokenMinus:            "MINUS",
	TokenEquals:           "EQUALS",
	TokenDot:              "DOT",
	TokenExclamation:      "EXCLAMATION",
	TokenError:            "ERROR",
}

// Lexer performs lexical analysis on Mermaid input
type Lexer struct {
	input    string
	position int
	line     int
	column   int
	tokens   []Token
}

// NewLexer creates a new lexer for the given input
func NewLexer(input string) *Lexer {
	return &Lexer{
		input:  input,
		line:   1,
		column: 1,
		tokens: make([]Token, 0),
	}
}

// Tokenize performs lexical analysis and returns all tokens
func (l *Lexer) Tokenize() ([]Token, error) {
	for l.position < len(l.input) {
		if err := l.nextToken(); err != nil {
			return nil, err
		}
	}

	// Add EOF token
	l.addToken(TokenEOF, "")
	return l.tokens, nil
}

// nextToken processes the next token from input
func (l *Lexer) nextToken() error {
	if l.position >= len(l.input) {
		return nil
	}

	ch := l.current()

	// Skip whitespace but track newlines
	if unicode.IsSpace(ch) {
		return l.consumeWhitespace()
	}

	// Comments - following mermaid.js pattern
	if ch == '%' && l.peek() == '%' {
		return l.consumeComment()
	}

	// Multi-character operators first (order matters!)
	if multiChar := l.tryMultiCharOperator(); multiChar != TokenError {
		return nil
	}

	// Keywords and identifiers
	if unicode.IsLetter(ch) || ch == '_' {
		return l.consumeIdentifier()
	}

	// Numbers
	if unicode.IsDigit(ch) {
		return l.consumeNumber()
	}

	// Strings
	if ch == '"' {
		return l.consumeString()
	}

	// Single character tokens
	return l.consumeSingleChar()
}

// tryMultiCharOperator attempts to match multi-character operators.
// It returns TokenError when no multi-character operator matches.
func (l *Lexer) tryMultiCharOperator() TokenType {
	// Check ER diagram relationships first (they must be matched before shorter patterns)
	if l.matchString("||--o{") {
		l.addTokenAndAdvance(TokenEROneToMany, "||--o{", 6)
		return TokenEROneToMany
	}
	if l.matchString("}o--||") {
		l.addTokenAndAdvance(TokenERManyToOne, "}o--||", 6)
		return TokenERManyToOne
	}
	if l.matchString("||--||") {
		l.addTokenAndAdvance(TokenEROneToOne, "||--||", 6)
		return TokenEROneToOne
	}
	if l.matchString("}o--o{") {
		l.addTokenAndAdvance(TokenERManyToMany, "}o--o{", 6)
		return TokenERManyToMany
	}
	if l.matchString("||--o|") {
		l.addTokenAndAdvance(TokenERZeroToOne, "||--o|", 6)
		return TokenERZeroToOne
	}

	// Check for sequence diagram arrows
	if l.matchString("->>") {
		l.addTokenAndAdvance(TokenArrowSolid, "->>", 3)
		return TokenArrowSolid
	}
	if l.matchString("-->>") {
		l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4)
		return TokenArrowDotted
	}

	// Check for arrows - based on destructLink patterns
	if l.matchString("==>") {
		l.addTokenAndAdvance(TokenArrowThick, "==>", 3)
		return TokenArrowThick
	}
	if l.matchString("-->") {
		l.addTokenAndAdvance(TokenArrowSolid, "-->", 3)
		return TokenArrowSolid
	}
	if l.matchString("-.->") {
		l.addTokenAndAdvance(TokenArrowDotted, "-.->", 4)
		return TokenArrowDotted
	}
	if l.matchString("--x") {
		l.addTokenAndAdvance(TokenArrowCross, "--x", 3)
		return TokenArrowCross
	}
	if l.matchString("--o") {
		l.addTokenAndAdvance(TokenArrowCircle, "--o", 3)
		return TokenArrowCircle
	}
	if l.matchString("---") {
		l.addTokenAndAdvance(TokenArrowOpen, "---", 3)
		return TokenArrowOpen
	}

	if l.matchString("((") {
		l.addTokenAndAdvance(TokenOpenDoubleParen, "((", 2)
		return TokenOpenDoubleParen
	}
	if l.matchString("))") {
		l.addTokenAndAdvance(TokenCloseDoubleParen, "))", 2)
		return TokenCloseDoubleParen
	}

	return TokenError
}

// consumeWhitespace consumes whitespace characters, emitting a SPACE token for
// runs of spaces and a NEWLINE token for each newline.
func (l *Lexer) consumeWhitespace() error {
	start := l.position

	for l.position < len(l.input) && unicode.IsSpace(l.current()) {
		if l.current() == '\n' {
			// Emit any pending run of spaces before the newline, then the newline itself
			if start < l.position {
				l.addToken(TokenSpace, l.input[start:l.position])
			}
			l.advance()
			l.addToken(TokenNewline, "\n")
			return nil
		}
		l.advance()
	}

	if start < l.position {
		l.addToken(TokenSpace, l.input[start:l.position])
	}
	return nil
}
// consumeComment consumes a comment line
func (l *Lexer) consumeComment() error {
	start := l.position

	// Skip %%
	l.advance()
	l.advance()

	// Read until end of line
	for l.position < len(l.input) && l.current() != '\n' {
		l.advance()
	}

	l.addToken(TokenComment, l.input[start:l.position])
	return nil
}

// consumeIdentifier consumes identifiers and keywords
func (l *Lexer) consumeIdentifier() error {
	start := l.position

	// First character already validated
	l.advance()

	// Continue with alphanumeric and underscore
	for l.position < len(l.input) {
		ch := l.current()
		if unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' {
			l.advance()
		} else {
			break
		}
	}

	value := l.input[start:l.position]
	tokenType := l.getKeywordType(value)
	l.addToken(tokenType, value)
	return nil
}

// getKeywordType returns the token type for keywords, or TokenID for identifiers
func (l *Lexer) getKeywordType(value string) TokenType {
	// Keywords from flow.jison
	switch strings.ToLower(value) {
	case "graph":
		return TokenGraph
	case "flowchart":
		return TokenGraph // flowchart uses same token as graph
	case "subgraph":
		return TokenSubgraph
	case "end":
		return TokenEnd
	case "direction":
		return TokenDirection // direction statements inside subgraphs
	case "class":
		return TokenClass
	case "classdef":
		return TokenClassDef
	case "click":
		return TokenClick
	case "style":
		return TokenStyle
	case "linkstyle":
		return TokenLinkStyle
	case "default":
		return TokenDefault

	// Direction keywords
	case "td":
		return TokenTD
	case "tb":
		return TokenTB
	case "bt":
		return TokenBT
	case "rl":
		return TokenRL
	case "lr":
		return TokenLR
	default:
		return TokenID
	}
}

// consumeNumber consumes numeric literals
func (l *Lexer) consumeNumber() error {
	start := l.position

	for l.position < len(l.input) && unicode.IsDigit(l.current()) {
		l.advance()
	}

	// Handle a decimal point only when a digit follows, so "1." is not lexed as a number
	if l.position < len(l.input) && l.current() == '.' && unicode.IsDigit(l.peek()) {
		l.advance()
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
	}

	l.addToken(TokenNumber, l.input[start:l.position])
	return nil
}
// consumeString consumes quoted string literals
func (l *Lexer) consumeString() error {
	start := l.position

	// Skip opening quote
	l.advance()

	for l.position < len(l.input) && l.current() != '"' {
		if l.current() == '\\' && l.position+1 < len(l.input) {
			// Skip escaped character
			l.advance()
			l.advance()
		} else {
			l.advance()
		}
	}

	if l.position >= len(l.input) {
		return fmt.Errorf("unterminated string at line %d, column %d", l.line, l.column)
	}

	// Skip closing quote
	l.advance()

	// The token value includes the surrounding quotes
	l.addToken(TokenString, l.input[start:l.position])
	return nil
}

// consumeSingleChar handles single character tokens
func (l *Lexer) consumeSingleChar() error {
	ch := l.current()
	var tokenType TokenType

	switch ch {
	case '[':
		tokenType = TokenOpenBracket
	case ']':
		tokenType = TokenCloseBracket
	case '(':
		tokenType = TokenOpenParen
	case ')':
		tokenType = TokenCloseParen
	case '{':
		tokenType = TokenOpenBrace
	case '}':
		tokenType = TokenCloseBrace
	case '<':
		tokenType = TokenOpenAngle
	case '>':
		tokenType = TokenCloseAngle
	case '|':
		tokenType = TokenPipe
	case ':':
		tokenType = TokenColon
	case ';':
		tokenType = TokenSemicolon
	case ',':
		tokenType = TokenComma
	case '&':
		tokenType = TokenAmpersand
	case '*':
		tokenType = TokenMult
	case '+':
		tokenType = TokenPlus
	case '-':
		tokenType = TokenMinus
	case '=':
		tokenType = TokenEquals
	case '.':
		tokenType = TokenDot
	case '!':
		tokenType = TokenExclamation
	default:
		return fmt.Errorf("unexpected character '%c' at line %d, column %d", ch, l.line, l.column)
	}

	l.addTokenAndAdvance(tokenType, string(ch), 1)
	return nil
}

// Helper methods

// current returns the current character. The lexer scans byte-by-byte, so
// multi-byte UTF-8 runes are seen one byte at a time.
func (l *Lexer) current() rune {
	if l.position >= len(l.input) {
		return 0
	}
	return rune(l.input[l.position])
}

// peek returns the next character without advancing
func (l *Lexer) peek() rune {
	if l.position+1 >= len(l.input) {
		return 0
	}
	return rune(l.input[l.position+1])
}

// advance moves to the next character, updating line and column counters
func (l *Lexer) advance() {
	if l.position < len(l.input) {
		if l.input[l.position] == '\n' {
			l.line++
			l.column = 1
		} else {
			l.column++
		}
		l.position++
	}
}

// matchString checks if the input matches the given string at the current position
func (l *Lexer) matchString(s string) bool {
	if l.position+len(s) > len(l.input) {
		return false
	}
	return l.input[l.position:l.position+len(s)] == s
}

// addToken adds a token to the token list. It assumes the token's characters
// have already been consumed, so the starting column and position are derived
// by stepping back over the value.
func (l *Lexer) addToken(tokenType TokenType, value string) {
	token := Token{
		Type:     tokenType,
		Value:    value,
		Line:     l.line,
		Column:   l.column - len(value),
		Position: l.position - len(value),
	}
	l.tokens = append(l.tokens, token)
}

// addTokenAndAdvance consumes length characters and then records the token,
// so that addToken's position arithmetic points at the token's start.
func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, length int) {
	for i := 0; i < length; i++ {
		l.advance()
	}
	l.addToken(tokenType, value)
}

// FilterTokens removes whitespace and comment tokens for parsing
func FilterTokens(tokens []Token) []Token {
	filtered := make([]Token, 0, len(tokens))
	for _, token := range tokens {
		if token.Type != TokenSpace && token.Type != TokenComment {
			filtered = append(filtered, token)
		}
	}
	return filtered
}
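// The sketch below is an illustrative usage example, not part of the lexer
// API: it assumes the package is imported under the name "lexer" and shows
// the typical tokenize-then-filter flow for a small flowchart definition.
// The exact token stream shown in the comment is approximate.
//
//	lex := lexer.NewLexer("graph TD\n  A --> B\n")
//	tokens, err := lex.Tokenize()
//	if err != nil {
//		// handle lexing errors (e.g. an unterminated string literal)
//	}
//	for _, tok := range lexer.FilterTokens(tokens) {
//		fmt.Println(tok) // e.g. Token{Type: GRAPH, Value: "graph", ...}, Token{Type: TD, ...}, ...
//	}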