lexer.go

// Package lexer provides lexical analysis for Mermaid diagram syntax.
// Based on the lexical rules from flow.jison in mermaid.js.
package lexer

import (
	"fmt"
	"strings"
	"unicode"
)

// TokenType represents the type of a lexical token.
type TokenType int

const (
	// Special tokens
	TokenEOF TokenType = iota
	TokenNewline
	TokenSpace
	TokenComment

	// Keywords - from flow.jison
	TokenGraph
	TokenSubgraph
	TokenEnd
	TokenDirection
	TokenClass
	TokenClassDef
	TokenClick
	TokenStyle
	TokenLinkStyle
	TokenDefault

	// Directions
	TokenTD // Top Down
	TokenTB // Top Bottom
	TokenBT // Bottom Top
	TokenRL // Right Left
	TokenLR // Left Right

	// Identifiers and literals
	TokenID
	TokenString
	TokenNodeString
	TokenNumber
	TokenUnicodeText

	// Shape delimiters - following JISON patterns
	TokenOpenBracket      // [
	TokenCloseBracket     // ]
	TokenOpenParen        // (
	TokenCloseParen       // )
	TokenOpenBrace        // {
	TokenCloseBrace       // }
	TokenOpenDoubleParen  // ((
	TokenCloseDoubleParen // ))
	TokenOpenAngle        // <
	TokenCloseAngle       // >

	// Edge tokens - from destructLink logic in flowDb.ts
	TokenArrowSolid  // -->
	TokenArrowDotted // -.->
	TokenArrowThick  // ==>
	TokenArrowOpen   // ---
	TokenArrowPoint  // -->
	TokenArrowCross  // --x
	TokenArrowCircle // --o

	// ER diagram relationship tokens
	TokenEROneToMany  // ||--o{
	TokenERManyToOne  // }o--||
	TokenEROneToOne   // ||--||
	TokenERManyToMany // }o--o{
	TokenERZeroToOne  // ||--o|

	// Edge modifiers
	TokenPipe        // |
	TokenColon       // :
	TokenSemicolon   // ;
	TokenComma       // ,
	TokenAmpersand   // &
	TokenMult        // *
	TokenPlus        // +
	TokenMinus       // -
	TokenEquals      // =
	TokenDot         // .
	TokenExclamation // !
	TokenSlash       // /
	TokenBackslash   // \
	TokenHash        // #
	TokenAt          // @
	TokenPercent     // %
	TokenTilde       // ~
	TokenQuestion    // ?

	// Error token
	TokenError
)

// Token represents a lexical token.
type Token struct {
	Type     TokenType
	Value    string
	Line     int
	Column   int
	Position int
}

// String returns a string representation of the token.
func (t Token) String() string {
	return fmt.Sprintf("Token{Type: %s, Value: %q, Line: %d, Col: %d}",
		t.Type.String(), t.Value, t.Line, t.Column)
}

// String returns the string representation of TokenType.
func (tt TokenType) String() string {
	if name, exists := tokenTypeNames[tt]; exists {
		return name
	}
	return fmt.Sprintf("TokenType(%d)", int(tt))
}

var tokenTypeNames = map[TokenType]string{
	TokenEOF:              "EOF",
	TokenNewline:          "NEWLINE",
	TokenSpace:            "SPACE",
	TokenComment:          "COMMENT",
	TokenGraph:            "GRAPH",
	TokenSubgraph:         "SUBGRAPH",
	TokenEnd:              "END",
	TokenDirection:        "DIRECTION",
	TokenClass:            "CLASS",
	TokenClassDef:         "CLASSDEF",
	TokenClick:            "CLICK",
	TokenStyle:            "STYLE",
	TokenLinkStyle:        "LINKSTYLE",
	TokenDefault:          "DEFAULT",
	TokenTD:               "TD",
	TokenTB:               "TB",
	TokenBT:               "BT",
	TokenRL:               "RL",
	TokenLR:               "LR",
	TokenID:               "ID",
	TokenString:           "STRING",
	TokenNodeString:       "NODE_STRING",
	TokenNumber:           "NUMBER",
	TokenUnicodeText:      "UNICODE_TEXT",
	TokenOpenBracket:      "OPEN_BRACKET",
	TokenCloseBracket:     "CLOSE_BRACKET",
	TokenOpenParen:        "OPEN_PAREN",
	TokenCloseParen:       "CLOSE_PAREN",
	TokenOpenBrace:        "OPEN_BRACE",
	TokenCloseBrace:       "CLOSE_BRACE",
	TokenOpenDoubleParen:  "OPEN_DOUBLE_PAREN",
	TokenCloseDoubleParen: "CLOSE_DOUBLE_PAREN",
	TokenOpenAngle:        "OPEN_ANGLE",
	TokenCloseAngle:       "CLOSE_ANGLE",
	TokenArrowSolid:       "ARROW_SOLID",
	TokenArrowDotted:      "ARROW_DOTTED",
	TokenArrowThick:       "ARROW_THICK",
	TokenArrowOpen:        "ARROW_OPEN",
	TokenArrowPoint:       "ARROW_POINT",
	TokenArrowCross:       "ARROW_CROSS",
	TokenArrowCircle:      "ARROW_CIRCLE",
	TokenEROneToMany:      "ER_ONE_TO_MANY",
	TokenERManyToOne:      "ER_MANY_TO_ONE",
	TokenEROneToOne:       "ER_ONE_TO_ONE",
	TokenERManyToMany:     "ER_MANY_TO_MANY",
	TokenERZeroToOne:      "ER_ZERO_TO_ONE",
	TokenPipe:             "PIPE",
	TokenColon:            "COLON",
	TokenSemicolon:        "SEMICOLON",
	TokenComma:            "COMMA",
	TokenAmpersand:        "AMPERSAND",
	TokenMult:             "MULT",
	TokenPlus:             "PLUS",
	TokenMinus:            "MINUS",
	TokenEquals:           "EQUALS",
	TokenDot:              "DOT",
	TokenExclamation:      "EXCLAMATION",
	TokenSlash:            "SLASH",
	TokenBackslash:        "BACKSLASH",
	TokenHash:             "HASH",
	TokenAt:               "AT",
	TokenPercent:          "PERCENT",
	TokenTilde:            "TILDE",
	TokenQuestion:         "QUESTION",
	TokenError:            "ERROR",
}

// Lexer performs lexical analysis on mermaid input.
type Lexer struct {
	input    string
	position int
	line     int
	column   int
	tokens   []Token
}

// NewLexer creates a new lexer for the given input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input:  input,
		line:   1,
		column: 1,
		tokens: make([]Token, 0),
	}
}

// Tokenize performs lexical analysis and returns all tokens.
func (l *Lexer) Tokenize() ([]Token, error) {
	for l.position < len(l.input) {
		if err := l.nextToken(); err != nil {
			return nil, err
		}
	}
	// Add EOF token
	l.addToken(TokenEOF, "")
	return l.tokens, nil
}

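// Illustrative usage (a minimal sketch; the diagram text below is an
// arbitrary example input, not something defined elsewhere in this package):
//
//	lex := NewLexer("graph LR; A-->B")
//	tokens, err := lex.Tokenize()
//	if err != nil {
//		// handle the lexing error (e.g. an unterminated string)
//	}
//	for _, tok := range FilterTokens(tokens) {
//		fmt.Println(tok)
//	}
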
// nextToken processes the next token from input.
func (l *Lexer) nextToken() error {
	if l.position >= len(l.input) {
		return nil
	}

	ch := l.current()

	// Skip whitespace but track newlines
	if unicode.IsSpace(ch) {
		return l.consumeWhitespace()
	}

	// Comments - following mermaid.js pattern
	if ch == '%' && l.peek() == '%' {
		return l.consumeComment()
	}

	// Multi-character operators first (order matters!)
	if multiChar := l.tryMultiCharOperator(); multiChar != TokenError {
		return nil
	}

	// Keywords and identifiers
	if unicode.IsLetter(ch) || ch == '_' {
		return l.consumeIdentifier()
	}

	// Numbers
	if unicode.IsDigit(ch) {
		return l.consumeNumber()
	}

	// Strings
	if ch == '"' {
		return l.consumeString()
	}

	// Single character tokens
	return l.consumeSingleChar()
}

// tryMultiCharOperator attempts to match multi-character operators.
func (l *Lexer) tryMultiCharOperator() TokenType {
	// Check for ER diagram relationships first (need to be before shorter patterns)
	if l.matchString("||--o{") {
		l.addTokenAndAdvance(TokenEROneToMany, "||--o{", 6)
		return TokenEROneToMany
	}
	if l.matchString("}o--||") {
		l.addTokenAndAdvance(TokenERManyToOne, "}o--||", 6)
		return TokenERManyToOne
	}
	if l.matchString("||--||") {
		l.addTokenAndAdvance(TokenEROneToOne, "||--||", 6)
		return TokenEROneToOne
	}
	if l.matchString("}o--o{") {
		l.addTokenAndAdvance(TokenERManyToMany, "}o--o{", 6)
		return TokenERManyToMany
	}
	if l.matchString("||--o|") {
		l.addTokenAndAdvance(TokenERZeroToOne, "||--o|", 6)
		return TokenERZeroToOne
	}

	// Check for sequence diagram arrows
	if l.matchString("->>") {
		l.addTokenAndAdvance(TokenArrowSolid, "->>", 3)
		return TokenArrowSolid
	}
	if l.matchString("-->>") {
		l.addTokenAndAdvance(TokenArrowDotted, "-->>", 4)
		return TokenArrowDotted
	}

	// Check for arrows - based on destructLink patterns
	if l.matchString("==>") {
		l.addTokenAndAdvance(TokenArrowThick, "==>", 3)
		return TokenArrowThick
	}
	if l.matchString("-->") {
		l.addTokenAndAdvance(TokenArrowSolid, "-->", 3)
		return TokenArrowSolid
	}
	if l.matchString("-.->") {
		l.addTokenAndAdvance(TokenArrowDotted, "-.->", 4)
		return TokenArrowDotted
	}
	if l.matchString("--x") {
		l.addTokenAndAdvance(TokenArrowCross, "--x", 3)
		return TokenArrowCross
	}
	if l.matchString("--o") {
		l.addTokenAndAdvance(TokenArrowCircle, "--o", 3)
		return TokenArrowCircle
	}
	if l.matchString("---") {
		l.addTokenAndAdvance(TokenArrowOpen, "---", 3)
		return TokenArrowOpen
	}

	if l.matchString("((") {
		l.addTokenAndAdvance(TokenOpenDoubleParen, "((", 2)
		return TokenOpenDoubleParen
	}
	if l.matchString("))") {
		l.addTokenAndAdvance(TokenCloseDoubleParen, "))", 2)
		return TokenCloseDoubleParen
	}

	return TokenError
}

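// A note on why the ordering above matters: longer operators must be tried
// before their prefixes. For example, the four-character "-->>" is tested
// before the three-character "-->"; otherwise an input such as "A-->>B"
// would lex as ARROW_SOLID followed by a stray CLOSE_ANGLE instead of
// ARROW_DOTTED.
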
// consumeWhitespace consumes whitespace characters.
func (l *Lexer) consumeWhitespace() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsSpace(l.current()) {
		if l.current() == '\n' {
			// Emit any run of spaces collected before the newline while the
			// position still points at the newline, then consume it and emit
			// a significant newline token.
			if start < l.position {
				l.addToken(TokenSpace, l.input[start:l.position])
			}
			l.advance()
			l.addToken(TokenNewline, "\n")
			return nil
		}
		l.advance()
	}
	if start < l.position {
		l.addToken(TokenSpace, l.input[start:l.position])
	}
	return nil
}

// consumeComment consumes a comment line.
func (l *Lexer) consumeComment() error {
	start := l.position
	// Skip %%
	l.advance()
	l.advance()
	// Read until end of line
	for l.position < len(l.input) && l.current() != '\n' {
		l.advance()
	}
	l.addToken(TokenComment, l.input[start:l.position])
	return nil
}

// consumeIdentifier consumes identifiers and keywords.
func (l *Lexer) consumeIdentifier() error {
	start := l.position
	// First character already validated
	l.advance()
	// Continue with alphanumeric and underscore
	for l.position < len(l.input) {
		ch := l.current()
		if unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_' {
			l.advance()
		} else {
			break
		}
	}
	value := l.input[start:l.position]
	tokenType := l.getKeywordType(value)
	l.addToken(tokenType, value)
	return nil
}

// getKeywordType returns the token type for keywords, or TokenID for
// identifiers. Matching is case-insensitive, so "graph", "Graph", and
// "GRAPH" all map to TokenGraph.
func (l *Lexer) getKeywordType(value string) TokenType {
	// Keywords from flow.jison
	switch strings.ToLower(value) {
	case "graph":
		return TokenGraph
	case "flowchart":
		return TokenGraph // flowchart uses the same token as graph
	case "subgraph":
		return TokenSubgraph
	case "end":
		return TokenEnd
	case "class":
		return TokenClass
	case "classdef":
		return TokenClassDef
	case "click":
		return TokenClick
	case "style":
		return TokenStyle
	case "linkstyle":
		return TokenLinkStyle
	case "default":
		return TokenDefault
	// Direction keywords
	case "td":
		return TokenTD
	case "tb":
		return TokenTB
	case "bt":
		return TokenBT
	case "rl":
		return TokenRL
	case "lr":
		return TokenLR
	default:
		return TokenID
	}
}

// consumeNumber consumes numeric literals.
func (l *Lexer) consumeNumber() error {
	start := l.position
	for l.position < len(l.input) && unicode.IsDigit(l.current()) {
		l.advance()
	}
	// Handle a decimal point only when a digit follows, so "5." lexes as a
	// number followed by a DOT token rather than swallowing the dot.
	if l.position < len(l.input) && l.current() == '.' && unicode.IsDigit(l.peek()) {
		l.advance()
		for l.position < len(l.input) && unicode.IsDigit(l.current()) {
			l.advance()
		}
	}
	l.addToken(TokenNumber, l.input[start:l.position])
	return nil
}

// consumeString consumes quoted string literals. The token value keeps the
// surrounding quotes, and escape sequences are left unprocessed.
func (l *Lexer) consumeString() error {
	start := l.position
	// Skip opening quote
	l.advance()
	for l.position < len(l.input) && l.current() != '"' {
		if l.current() == '\\' && l.position+1 < len(l.input) {
			// Skip escaped character
			l.advance()
			l.advance()
		} else {
			l.advance()
		}
	}
	if l.position >= len(l.input) {
		return fmt.Errorf("unterminated string at line %d, column %d", l.line, l.column)
	}
	// Skip closing quote
	l.advance()
	l.addToken(TokenString, l.input[start:l.position])
	return nil
}

// consumeSingleChar handles single character tokens.
func (l *Lexer) consumeSingleChar() error {
	ch := l.current()
	var tokenType TokenType

	switch ch {
	case '[':
		tokenType = TokenOpenBracket
	case ']':
		tokenType = TokenCloseBracket
	case '(':
		tokenType = TokenOpenParen
	case ')':
		tokenType = TokenCloseParen
	case '{':
		tokenType = TokenOpenBrace
	case '}':
		tokenType = TokenCloseBrace
	case '<':
		tokenType = TokenOpenAngle
	case '>':
		tokenType = TokenCloseAngle
	case '|':
		tokenType = TokenPipe
	case ':':
		tokenType = TokenColon
	case ';':
		tokenType = TokenSemicolon
	case ',':
		tokenType = TokenComma
	case '&':
		tokenType = TokenAmpersand
	case '*':
		tokenType = TokenMult
	case '+':
		tokenType = TokenPlus
	case '-':
		tokenType = TokenMinus
	case '=':
		tokenType = TokenEquals
	case '.':
		tokenType = TokenDot
	case '!':
		tokenType = TokenExclamation
	case '/':
		tokenType = TokenSlash
	case '\\':
		tokenType = TokenBackslash
	case '#':
		tokenType = TokenHash
	case '@':
		tokenType = TokenAt
	case '%':
		tokenType = TokenPercent
	case '~':
		tokenType = TokenTilde
	case '?':
		tokenType = TokenQuestion
	default:
		return fmt.Errorf("unexpected character '%c' at line %d, column %d", ch, l.line, l.column)
	}

	l.addTokenAndAdvance(tokenType, string(ch), 1)
	return nil
}

// Helper methods

// current returns the current character. Indexing is byte-wise, so
// multi-byte UTF-8 sequences are not decoded into a single rune.
func (l *Lexer) current() rune {
	if l.position >= len(l.input) {
		return 0
	}
	return rune(l.input[l.position])
}

// peek returns the next character without advancing.
func (l *Lexer) peek() rune {
	if l.position+1 >= len(l.input) {
		return 0
	}
	return rune(l.input[l.position+1])
}

// advance moves to the next character, updating line and column tracking.
func (l *Lexer) advance() {
	if l.position < len(l.input) {
		if l.input[l.position] == '\n' {
			l.line++
			l.column = 1
		} else {
			l.column++
		}
		l.position++
	}
}

// matchString checks if the input matches the given string at the current position.
func (l *Lexer) matchString(s string) bool {
	if l.position+len(s) > len(l.input) {
		return false
	}
	return l.input[l.position:l.position+len(s)] == s
}

// addToken adds a token to the token list. Line, Column, and Position are
// computed on the assumption that the token's value has already been
// consumed, so they point back to the start of the token.
func (l *Lexer) addToken(tokenType TokenType, value string) {
	token := Token{
		Type:     tokenType,
		Value:    value,
		Line:     l.line,
		Column:   l.column - len(value),
		Position: l.position - len(value),
	}
	l.tokens = append(l.tokens, token)
}

// addTokenAndAdvance consumes length characters and then records the token,
// so that addToken's start-of-token arithmetic stays consistent with the
// other call sites.
func (l *Lexer) addTokenAndAdvance(tokenType TokenType, value string, length int) {
	for i := 0; i < length; i++ {
		l.advance()
	}
	l.addToken(tokenType, value)
}

// FilterTokens removes whitespace and comment tokens for parsing.
func FilterTokens(tokens []Token) []Token {
	filtered := make([]Token, 0, len(tokens))
	for _, token := range tokens {
		if token.Type != TokenSpace && token.Type != TokenComment {
			filtered = append(filtered, token)
		}
	}
	return filtered
}
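
// For a rough sense of the output, an input like "graph LR; A-->B" should,
// after FilterTokens, yield approximately: GRAPH("graph"), LR("LR"),
// SEMICOLON(";"), ID("A"), ARROW_SOLID("-->"), ID("B"), EOF. The exact
// stream depends on the rules above; this trace is illustrative only.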