// Package lexer provides lexical analysis for Mermaid diagram syntax.
// It is based on the lexical rules from flow.jison in mermaid.js.
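//
// Typical usage (an illustrative sketch; error handling elided):
//
//	tokens, err := lexer.NewLexer("graph LR; A-->B").Tokenize()
//	if err != nil {
//		// handle the lexing error
//	}
//	for _, tok := range lexer.FilterTokens(tokens) {
//		fmt.Println(tok)
//	}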
package lexer

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// TokenType represents the type of a lexical token.
type TokenType int

const (
	// Special tokens
	TokenEOF TokenType = iota
	TokenNewline
	TokenSpace
	TokenComment

	// Keywords - from flow.jison
	TokenGraph
	TokenSubgraph
	TokenEnd
	TokenDirection
	TokenClass
	TokenClassDef
	TokenClick
	TokenStyle
	TokenLinkStyle
	TokenDefault

	// Directions
	TokenTD // Top Down
	TokenTB // Top Bottom
	TokenBT // Bottom Top
	TokenRL // Right Left
	TokenLR // Left Right

	// Identifiers and literals
	TokenID
	TokenString
	TokenNodeString
	TokenNumber
	TokenUnicodeText

	// Shape delimiters - following the JISON patterns
	TokenOpenBracket      // [
	TokenCloseBracket     // ]
	TokenOpenParen        // (
	TokenCloseParen       // )
	TokenOpenBrace        // {
	TokenCloseBrace       // }
	TokenOpenDoubleParen  // ((
	TokenCloseDoubleParen // ))
	TokenOpenAngle        // <
	TokenCloseAngle       // >

	// Edge tokens - from the destructLink logic in flowDb.ts
	TokenArrowSolid  // --> or ->>
	TokenArrowDotted // -.-> or -->>
	TokenArrowThick  // ==>
	TokenArrowOpen   // ---
	TokenArrowPoint  // --> (the lexer currently emits TokenArrowSolid for this form)
	TokenArrowCross  // --x
	TokenArrowCircle // --o

	// ER diagram relationship tokens
	TokenEROneToMany  // ||--o{
	TokenERManyToOne  // }o--||
	TokenEROneToOne   // ||--||
	TokenERManyToMany // }o--o{
	TokenERZeroToOne  // ||--o|

	// Edge modifiers and punctuation
	TokenPipe        // |
	TokenColon       // :
	TokenSemicolon   // ;
	TokenComma       // ,
	TokenAmpersand   // &
	TokenMult        // *
	TokenPlus        // +
	TokenMinus       // -
	TokenEquals      // =
	TokenDot         // .
	TokenExclamation // !
	TokenSlash       // /
	TokenBackslash   // \
	TokenHash        // #
	TokenAt          // @
	TokenPercent     // %
	TokenTilde       // ~
	TokenQuestion    // ?

	// Error token
	TokenError
)

// Token represents a lexical token.
type Token struct {
	Type     TokenType
	Value    string
	Line     int // 1-based line number
	Column   int // 1-based column number
	Position int // byte offset into the input
}

// String returns a string representation of the token.
func (t Token) String() string {
	return fmt.Sprintf("Token{Type: %s, Value: %q, Line: %d, Col: %d}",
		tokenTypeNames[t.Type], t.Value, t.Line, t.Column)
}

var tokenTypeNames = map[TokenType]string{
	TokenEOF:              "EOF",
	TokenNewline:          "NEWLINE",
	TokenSpace:            "SPACE",
	TokenComment:          "COMMENT",
	TokenGraph:            "GRAPH",
	TokenSubgraph:         "SUBGRAPH",
	TokenEnd:              "END",
	TokenDirection:        "DIRECTION",
	TokenClass:            "CLASS",
	TokenClassDef:         "CLASSDEF",
	TokenClick:            "CLICK",
	TokenStyle:            "STYLE",
	TokenLinkStyle:        "LINKSTYLE",
	TokenDefault:          "DEFAULT",
	TokenTD:               "TD",
	TokenTB:               "TB",
	TokenBT:               "BT",
	TokenRL:               "RL",
	TokenLR:               "LR",
	TokenID:               "ID",
	TokenString:           "STRING",
	TokenNodeString:       "NODE_STRING",
	TokenNumber:           "NUMBER",
	TokenUnicodeText:      "UNICODE_TEXT",
	TokenOpenBracket:      "OPEN_BRACKET",
	TokenCloseBracket:     "CLOSE_BRACKET",
	TokenOpenParen:        "OPEN_PAREN",
	TokenCloseParen:       "CLOSE_PAREN",
	TokenOpenBrace:        "OPEN_BRACE",
	TokenCloseBrace:       "CLOSE_BRACE",
	TokenOpenDoubleParen:  "OPEN_DOUBLE_PAREN",
	TokenCloseDoubleParen: "CLOSE_DOUBLE_PAREN",
	TokenOpenAngle:        "OPEN_ANGLE",
	TokenCloseAngle:       "CLOSE_ANGLE",
	TokenArrowSolid:       "ARROW_SOLID",
	TokenArrowDotted:      "ARROW_DOTTED",
	TokenArrowThick:       "ARROW_THICK",
	TokenArrowOpen:        "ARROW_OPEN",
	TokenArrowPoint:       "ARROW_POINT",
	TokenArrowCross:       "ARROW_CROSS",
	TokenArrowCircle:      "ARROW_CIRCLE",
	TokenEROneToMany:      "ER_ONE_TO_MANY",
	TokenERManyToOne:      "ER_MANY_TO_ONE",
	TokenEROneToOne:       "ER_ONE_TO_ONE",
	TokenERManyToMany:     "ER_MANY_TO_MANY",
	TokenERZeroToOne:      "ER_ZERO_TO_ONE",
	TokenPipe:             "PIPE",
	TokenColon:            "COLON",
	TokenSemicolon:        "SEMICOLON",
	TokenComma:            "COMMA",
	TokenAmpersand:        "AMPERSAND",
	TokenMult:             "MULT",
	TokenPlus:             "PLUS",
	TokenMinus:            "MINUS",
	TokenEquals:           "EQUALS",
	TokenDot:              "DOT",
	TokenExclamation:      "EXCLAMATION",
	TokenSlash:            "SLASH",
	TokenBackslash:        "BACKSLASH",
	TokenHash:             "HASH",
	TokenAt:               "AT",
	TokenPercent:          "PERCENT",
	TokenTilde:            "TILDE",
	TokenQuestion:         "QUESTION",
	TokenError:            "ERROR",
}

// Lexer performs lexical analysis on Mermaid input.
type Lexer struct {
	input    string
	position int // byte offset of the next rune to read
	line     int // current line, 1-based
	column   int // current column, 1-based
	tokens   []Token
}

// NewLexer creates a new lexer for the given input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input:  input,
		line:   1,
		column: 1,
		tokens: make([]Token, 0),
	}
}

// Tokenize performs lexical analysis and returns all tokens, terminated
// by an EOF token.
func (l *Lexer) Tokenize() ([]Token, error) {
	for l.position < len(l.input) {
		if err := l.nextToken(); err != nil {
			return nil, err
		}
	}

	l.addToken(TokenEOF, "")
	return l.tokens, nil
}
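
// For illustration, the input "graph LR; A-->B" tokenizes (before any
// filtering) as:
//
//	GRAPH("graph") SPACE LR("LR") SEMICOLON(";") SPACE
//	ID("A") ARROW_SOLID("-->") ID("B") EOF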

// nextToken processes the next token from the input.
func (l *Lexer) nextToken() error {
	if l.position >= len(l.input) {
		return nil
	}

	ch := l.current()

	// Whitespace is tokenized rather than skipped; newlines are significant.
	if unicode.IsSpace(ch) {
		return l.consumeWhitespace()
	}

	// Comments - following the mermaid.js "%%" pattern.
	if ch == '%' && l.peek() == '%' {
		return l.consumeComment()
	}

	// Multi-character operators first (order matters).
	if l.tryMultiCharOperator() != TokenError {
		return nil
	}

	// Keywords and identifiers.
	if unicode.IsLetter(ch) || ch == '_' {
		return l.consumeIdentifier()
	}

	// Numbers.
	if unicode.IsDigit(ch) {
		return l.consumeNumber()
	}

	// Strings.
	if ch == '"' {
		return l.consumeString()
	}

	// Single-character tokens.
	return l.consumeSingleChar()
}

// tryMultiCharOperator attempts to match multi-character operators. It
// returns TokenError when nothing matched, in which case no input is
// consumed. Longer patterns must be tested before their prefixes: for
// example, "-->>" must come before "-->", or the input "-->>" would lex
// as ARROW_SOLID followed by a separate '>' token.
func (l *Lexer) tryMultiCharOperator() TokenType {
	// ER diagram relationships first (longer than the arrow patterns).
	if l.matchString("||--o{") {
		l.addTokenAndAdvance(TokenEROneToMany, "||--o{", 6)
		return TokenEROneToMany
	}
	if l.matchString("}o--||") {
		l.addTokenAndAdvance(TokenERManyToOne, "}o--||", 6)
		return TokenERManyToOne
	}
	if l.matchString("||--||") {
		l.addTokenAndAdvance(TokenEROneToOne, "||--||", 6)
		return TokenEROneToOne
	}
	if l.matchString("}o--o{") {
		l.addTokenAndAdvance(TokenERManyToMany, "}o--o{", 6)
		return TokenERManyToMany
	}
	if l.matchString("||--o|") {
		l.addTokenAndAdvance(TokenERZeroToOne, "||--o|", 6)
		return TokenERZeroToOne
	}

	// Sequence diagram arrows.
	if l.matchString("->>") {
		l.addTokenAndAdvance(TokenArrowSolid, "->>", 3)
		return TokenArrowSolid
	}
	if l.matchString("-->>") {
		l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4)
		return TokenArrowDotted
	}

	// Flowchart arrows - based on the destructLink patterns.
	if l.matchString("==>") {
		l.addTokenAndAdvance(TokenArrowThick, "==>", 3)
		return TokenArrowThick
	}
	if l.matchString("-->") {
		l.addTokenAndAdvance(TokenArrowSolid, "-->", 3)
		return TokenArrowSolid
	}
	if l.matchString("-.->") {
		l.addTokenAndAdvance(TokenArrowDotted, "-.->", 4)
		return TokenArrowDotted
	}
	if l.matchString("--x") {
		l.addTokenAndAdvance(TokenArrowCross, "--x", 3)
		return TokenArrowCross
	}
	if l.matchString("--o") {
		l.addTokenAndAdvance(TokenArrowCircle, "--o", 3)
		return TokenArrowCircle
	}
	if l.matchString("---") {
		l.addTokenAndAdvance(TokenArrowOpen, "---", 3)
		return TokenArrowOpen
	}

	// Double parentheses, used for circle-shaped nodes.
	if l.matchString("((") {
		l.addTokenAndAdvance(TokenOpenDoubleParen, "((", 2)
		return TokenOpenDoubleParen
	}
	if l.matchString("))") {
		l.addTokenAndAdvance(TokenCloseDoubleParen, "))", 2)
		return TokenCloseDoubleParen
	}

	return TokenError
}
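
// For illustration, an ER relationship line such as
//
//	CUSTOMER ||--o{ ORDER : places
//
// lexes (after FilterTokens) as ID("CUSTOMER"), ER_ONE_TO_MANY("||--o{"),
// ID("ORDER"), COLON(":"), ID("places").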

// consumeWhitespace consumes a run of whitespace, emitting a SPACE token
// for spaces and tabs and a NEWLINE token for a newline.
func (l *Lexer) consumeWhitespace() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsSpace(l.current()) {
		if l.current() == '\n' {
			// Emit any spaces that preceded the newline, then the newline
			// itself; advance handles the line/column bookkeeping.
			if start < l.position {
				l.addToken(TokenSpace, l.input[start:l.position])
			}
			l.advance()
			l.addToken(TokenNewline, "\n")
			return nil
		}
		l.advance()
	}

	if start < l.position {
		l.addToken(TokenSpace, l.input[start:l.position])
	}
	return nil
}

// consumeComment consumes a "%%" comment through the end of the line.
func (l *Lexer) consumeComment() error {
	start := l.position

	// Skip the leading "%%".
	l.advance()
	l.advance()

	// Read until end of line.
	for l.position < len(l.input) && l.current() != '\n' {
		l.advance()
	}

	l.addToken(TokenComment, l.input[start:l.position])
	return nil
}

// consumeIdentifier consumes identifiers and keywords.
func (l *Lexer) consumeIdentifier() error {
	start := l.position

	// The first character was already validated by the caller.
	l.advance()

	// Continue with letters, digits, and underscores.
	for l.position < len(l.input) {
		ch := l.current()
		if unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' {
			l.advance()
		} else {
			break
		}
	}

	value := l.input[start:l.position]
	l.addToken(l.getKeywordType(value), value)
	return nil
}

// getKeywordType returns the token type for keywords, or TokenID for plain
// identifiers. Matching is case-insensitive.
func (l *Lexer) getKeywordType(value string) TokenType {
	// Keywords from flow.jison.
	switch strings.ToLower(value) {
	case "graph", "flowchart":
		return TokenGraph // flowchart uses the same token as graph
	case "subgraph":
		return TokenSubgraph
	case "end":
		return TokenEnd
	case "class":
		return TokenClass
	case "classdef":
		return TokenClassDef
	case "click":
		return TokenClick
	case "style":
		return TokenStyle
	case "linkstyle":
		return TokenLinkStyle
	case "default":
		return TokenDefault

	// Direction keywords.
	case "td":
		return TokenTD
	case "tb":
		return TokenTB
	case "bt":
		return TokenBT
	case "rl":
		return TokenRL
	case "lr":
		return TokenLR
	default:
		return TokenID
	}
}

// consumeNumber consumes numeric literals, including a decimal part.
func (l *Lexer) consumeNumber() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsDigit(l.current()) {
		l.advance()
	}

	// Handle a decimal point followed by more digits.
	if l.position < len(l.input) && l.current() == '.' {
		l.advance()
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
	}

	l.addToken(TokenNumber, l.input[start:l.position])
	return nil
}

// consumeString consumes a double-quoted string literal. The token value
// includes the surrounding quotes; backslash escapes are skipped over, not
// interpreted.
func (l *Lexer) consumeString() error {
	start := l.position

	// Skip the opening quote.
	l.advance()

	for l.position < len(l.input) && l.current() != '"' {
		if l.current() == '\\' && l.position+1 < len(l.input) {
			// Skip the backslash and the escaped character.
			l.advance()
			l.advance()
		} else {
			l.advance()
		}
	}

	if l.position >= len(l.input) {
		return fmt.Errorf("unterminated string at line %d, column %d", l.line, l.column)
	}

	// Skip the closing quote.
	l.advance()
	l.addToken(TokenString, l.input[start:l.position])
	return nil
}

// consumeSingleChar handles single-character tokens.
func (l *Lexer) consumeSingleChar() error {
	ch := l.current()
	var tokenType TokenType

	switch ch {
	case '[':
		tokenType = TokenOpenBracket
	case ']':
		tokenType = TokenCloseBracket
	case '(':
		tokenType = TokenOpenParen
	case ')':
		tokenType = TokenCloseParen
	case '{':
		tokenType = TokenOpenBrace
	case '}':
		tokenType = TokenCloseBrace
	case '<':
		tokenType = TokenOpenAngle
	case '>':
		tokenType = TokenCloseAngle
	case '|':
		tokenType = TokenPipe
	case ':':
		tokenType = TokenColon
	case ';':
		tokenType = TokenSemicolon
	case ',':
		tokenType = TokenComma
	case '&':
		tokenType = TokenAmpersand
	case '*':
		tokenType = TokenMult
	case '+':
		tokenType = TokenPlus
	case '-':
		tokenType = TokenMinus
	case '=':
		tokenType = TokenEquals
	case '.':
		tokenType = TokenDot
	case '!':
		tokenType = TokenExclamation
	case '/':
		tokenType = TokenSlash
	case '\\':
		tokenType = TokenBackslash
	case '#':
		tokenType = TokenHash
	case '@':
		tokenType = TokenAt
	case '%':
		tokenType = TokenPercent
	case '~':
		tokenType = TokenTilde
	case '?':
		tokenType = TokenQuestion
	default:
		return fmt.Errorf("unexpected character '%c' at line %d, column %d", ch, l.line, l.column)
	}

	l.addTokenAndAdvance(tokenType, string(ch), 1)
	return nil
}

// Helper methods

// current returns the rune at the current position, or 0 at end of input.
// Runes are decoded as UTF-8 so multi-byte characters are read correctly.
func (l *Lexer) current() rune {
	if l.position >= len(l.input) {
		return 0
	}
	r, _ := utf8.DecodeRuneInString(l.input[l.position:])
	return r
}

// peek returns the rune after the current one without advancing.
func (l *Lexer) peek() rune {
	if l.position >= len(l.input) {
		return 0
	}
	_, size := utf8.DecodeRuneInString(l.input[l.position:])
	if l.position+size >= len(l.input) {
		return 0
	}
	r, _ := utf8.DecodeRuneInString(l.input[l.position+size:])
	return r
}

// advance moves past the current rune, updating line and column bookkeeping.
func (l *Lexer) advance() {
	if l.position >= len(l.input) {
		return
	}
	r, size := utf8.DecodeRuneInString(l.input[l.position:])
	if r == '\n' {
		l.line++
		l.column = 1
	} else {
		l.column++
	}
	l.position += size
}

// matchString reports whether the input matches s at the current position.
func (l *Lexer) matchString(s string) bool {
	if l.position+len(s) > len(l.input) {
		return false
	}
	return l.input[l.position:l.position+len(s)] == s
}

// addToken appends a token whose text has just been consumed; the start
// column and byte position are computed by backing up over the token value.
func (l *Lexer) addToken(tokenType TokenType, value string) {
	l.tokens = append(l.tokens, Token{
		Type:     tokenType,
		Value:    value,
		Line:     l.line,
		Column:   l.column - len(value),
		Position: l.position - len(value),
	})
}

// addTokenAndAdvance advances past a matched operator and then records the
// token. Advancing first keeps addToken's arithmetic correct, since it
// assumes the token text has already been consumed; recording first (as an
// earlier version did) would shift every operator token's position back by
// the length of its value.
func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, length int) {
	for i := 0; i < length; i++ {
		l.advance()
	}
	l.addToken(tokenType, value)
}

// FilterTokens removes whitespace and comment tokens so a parser only sees
// significant tokens. NEWLINE tokens are kept, since they separate
// statements.
func FilterTokens(tokens []Token) []Token {
	filtered := make([]Token, 0, len(tokens))
	for _, token := range tokens {
		if token.Type != TokenSpace && token.Type != TokenComment {
			filtered = append(filtered, token)
		}
	}
	return filtered
}
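
// A minimal end-to-end sketch of the intended flow (illustrative only; it
// assumes a separate parser consumes the filtered stream):
//
//	lex := NewLexer("graph TD\nA[Start] --> B{Decision}")
//	tokens, err := lex.Tokenize()
//	if err != nil {
//		// report the lex error
//	}
//	parserInput := FilterTokens(tokens) // SPACE and COMMENT removed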