# -*- tcl -*-
#
# Copyright (c) 2005 by Andreas Kupries <andreas_kupries@users.sourceforge.net>
# Grammars / Parsing Expression Grammars / Container
# ### ### ### ######### ######### #########
## Package description
# A class whose instances hold all the information describing a single
# parsing expression grammar (terminal symbols, nonterminal symbols,
# nonterminal rules, start expression, hints), and operations to
# define, manipulate, and query this information.
#
# The container has only one functionality beyond the simple storage
# of the aforementioned information. It keeps track if the provided
# grammar is valid (*). The container provides no higher-level
# operations on the grammar, like removal of unreachable nonterminals,
# rule rewriting, etc.
#
# The set of terminal symbols is the set of characters (i.e.
# implicitly defined). For Tcl this means that all the unicode
# characters are supported.
#
# (*) A grammar is valid if and only if all its rules are valid. A
# rule is valid if and only if all nonterminals referenced by the RHS
# of the rule are in the set of nonterminals, and if only the allowed
# operators are used in the expression.
# ### ### ### ######### ######### #########
## Requisites
package require snit ; # Tcllib | OO system used
# ### ### ### ######### ######### #########
## Implementation
snit::type ::grammar::peg {
# ### ### ### ######### ######### #########
## Type API. Helpful methods for PEs.
proc ValidateSerial {e prefix} {}
proc Validate {e} {}
proc References {e} {}
proc Rename {e old new} {}
# ### ### ### ######### ######### #########
## Instance API
constructor {args} {}
method clear {} {}
method = {src} {}
method --> {dst} {}
method serialize {} {}
method deserialize {value} {}
method {is valid} {} {}
method start {args} {}
method nonterminals {} {}
method {nonterminal add} {nts pae} {}
method {nonterminal delete} {nts pae} {}
method {nonterminal exists} {nts} {}
method {nonterminal rename} {ntsold ntsnew} {}
method {nonterminal mode} {nts args} {}
method {unknown nonterminals} {} {}
method {nonterminal rule} {nts} {}
# ### ### ### ######### ######### #########
## Internal data structures.
## - Set of nonterminal symbols, and
## - Mapping from nonterminals to their defining parsing
## expressions, and
## - Start parsing expression.
## - And usage of nonterminals by others, required for tracking
## of validity.
## se: expression | Start expression
## nt: nonterm -> expression | Known Nt's, their rules
## re: nonterm -> list(nonterm) | Known Nt's, what others they use.
## ir: nonterm -> list(nonterm) | Nt's, possibly unknown, their users.
## uk: nonterm -> use counter | Nt's which are unknown.
##
## Both 're' and 'ir' can list a nonterminal A multiple times,
## if it uses or is used multiple times.
##
## Grammar is invalid <=> '[array size uk] > 0'
variable se epsilon
variable nt -array {}
variable re -array {}
variable ir -array {}
variable uk -array {}
variable mo -array {}
# ### ### ### ######### ######### #########
## Instance API Implementation.
constructor {args} {
if {
(([llength $args] != 0) && ([llength $args] != 2)) ||
(([llength $args] == 2) && ([lsearch {= := <-- as deserialize} [lindex $args 0]]) < 0)
} {
return -code error "wrong#args: $self ?=|:=|<--|as|deserialize a'?"
}
# Serialization arguments.
# [llength args] in {0 2}
#
# = src-obj
# := src-obj
# <-- src-obj
# as src-obj
# deserialize src-value
if {[llength $args] == 2} {
foreach {op val} $args break
switch -exact -- $op {
= - := - <-- - as {
$self deserialize [$val serialize]
}
deserialize {
$self deserialize $val
}
}
}
return
}
#destructor {}
method clear {} {
array unset nt *
array unset re *
array unset ir *
array unset uk *
array unset mo *
set se epsilon
return
}
method = {src} {
$self dserialize [$src serialize]
}
method --> {dst} {
$dst deserialize [$self serialize]
}
method serialize {} {
return [::list \
grammar::pegc \
[array get nt] \
[array get mo] \
$se]
}
method deserialize {value} {
# Validate value, then clear and refill.
$self CheckSerialization $value ntv mov sev
$self clear
foreach {s e} $ntv {
$self NtAdd $s $e
}
array set mo $mov
$self start $sev
return
}
method {is valid} {} {
return [expr {[array size uk] == 0}]
}
method start {args} {
if {[llength $args] == 0} {
return $se
}
if {[llength $args] > 1} {
return -code error "wrong#args: $self start ?pe?"
}
set newse [lindex $args 0]
Validate $newse
set se $newse
return
}
method nonterminals {} {
return [array names nt]
}
method {nonterminal add} {nts pae} {
$self CheckNtKnown $nts
Validate $pae
$self NtAdd $nts $pae
return
}
method {nonterminal mode} {nts args} {
$self CheckNt $nts
if {![llength $args]} {
return $mo($nts)
} elseif {[llength $args] == 1} {
set mo($nts) [lindex $args 0]
return
} else {
return -code error "wrong#args"
}
return
}
method {nonterminal delete} {nts args} {
set args [linsert $args 0 $nts]
foreach nts $args {
$self CheckNt $nts
}
foreach nts $args {
$self NtDelete $nts
}
return
}
method {nonterminal exists} {nts} {
return [info exists nt($nts)]
}
method {nonterminal rename} {ntsold ntsnew} {
$self CheckNt $ntsold
$self CheckNtKnown $ntsnew
# Difficult. We have to go through all rules and rewrite their
# RHS to use the new name of the nonterminal. We can however
# restrict ourselves to the rules which actually use the
# changed nonterminal.
# We also have to update the used/user information. We know
# that the validity of the grammar is unchanged by this
# operation. The unknown information is unchanged as well, as
# we cannot rename an unknown nonterminal. IOW we know that
# 'ntsold' is not in 'uk', and so 'ntsnew' will not be in that
# array either after the rename.
set myusers $ir($ntsold)
set myused $re($ntsold)
set nt($ntsnew) $nt($ntsold)
unset nt($ntsold)
set mo($ntsnew) $mo($ntsold)
unset mo($ntsold)
foreach x $myusers {
set nt($x) [Rename $nt($x) $ntsold $ntsnew]
}
# It is possible to use myself, and be used by myself.
while {[set pos [lsearch -exact $myusers $ntsold]] >= 0} {
set myusers [lreplace $myusers $pos $pos $ntsnew]
}
while {[set pos [lsearch -exact $myused $ntsold]] >= 0} {
set myused [lreplace $myused $pos $pos $ntsnew]
}
set re($ntsnew) $myusers
set ir($ntsnew) $myused
unset re($ntsold)
unset ir($ntsold)
return
}
method {unknown nonterminals} {} {
return [array names uk]
}
method {nonterminal rule} {nts} {
$self CheckNt $nts
return $nt($nts)
}
# ### ### ### ######### ######### #########
## Internal helper methods
method NtAdd {nts pae} {
# None of the symbols is known. We can add them to the
# grammar. If however any of their PEs is known to the PE
# storage then we had expressions refering to unknown
# symbols. The grammar is most certainly invalid and may have
# become valid right now. We have to invalidate the validity
# cache.
set nt($nts) $pae
set mo($nts) value
# Track users, uses, and unknowns.
set references [References $pae]
# We use the refered symbols
set re($nts) $references
# We are a user for the refered symbols
# Record unknown symbols immediately.
foreach x $references {
lappend ir($x) $nts
if {[info exists nt($x)]} continue
if {[catch {incr uk($x)}]} {set uk($x) 1}
}
# We are definitely not unknown.
unset -nocomplain uk($nts)
return
}
method NtDelete {nts} {
set references $re($nt)
# We are gone. We are not using anything anymore.
unset nt($nts)
unset re($nts)
unset mo($nts)
# Our references loose us as their user.
foreach x $references {
set pos [lsearch -exact $ir($x) $x]
if {$pos < 0} {error PANIC}
set ir($x) [lreplace $ir($x) $pos $pos]
if {[llength $ir($x)] == 0} {
unset ir($x)
# x is not referenced anywhere, cannot be unknown.
unset -nocomplain uk($x)
}
if {[info exists uk($x)]} {
incr uk($x) -1
}
}
# We might be used by others still, and therefore become
# unknown.
if {[info exists ir($nts]} {
set uk($nts) [llength $ir($nts)]
}
return
}
method CheckNt {nts} {
if {![info exists nt($nts)]} {
return -code error "Invalid nonterminal \"$nts\""
}
return
}
method CheckNtKnown {nts} {
if {[info exists nt($nts)]} {
return -code error "Nonterminal \"$nts\" is already known"
}
return
}
method CheckSerialization {value ntv mov sev} {
# value is list/3 ('grammar::pegc' nonterminals start)
# terminals is list of string.
# nonterminals is doct (key is string, value is expr)
# start is expr
# terminals * nonterminals == empty
# expr is parsing expression (Validate PE).
upvar 1 \
$ntv ntvs \
$mov movs \
$sev sevs
set prefix "error in serialization:"
if {[llength $value] != 4} {
return -code error "$prefix list length not 4"
}
struct::list assign $value type nonterminals hints start
if {$type ne "grammar::pegc"} {
return -code error "$prefix unknown type \"$type\""
}
ValidateSerial $start "$prefix invalid start expression"
if {[llength $nonterminals] % 2 == 1} {
return -code error "$prefix nonterminal data is not a dictionary"
}
array set _nt $nonterminals
if {[llength $nonterminals] != (2*[array size _nt])} {
return -code error "$prefix nonterminal data contains duplicate names, or misses some"
}
foreach {s e} $nonterminals {
ValidateSerial $start "$prefix nonterminal \"$s\", invalid parsing expression"
}
if {[llength $hints] % 2 == 1} {
return -code error "$prefix nonterminal modes is not a dictionary"
}
array set _mo $hints
if {[llength $hints] != (2*[array size _mo])} {
return -code error "$prefix nonterminal modes contains duplicate names, or misses some"
}
foreach {s _} $hints {
if {![info exists _nt($s)]} {
return -code error "$prefix nonterminal mode for unknown nonterminal \"$s\""
}
}
set ntvs $nonterminals
set sevs $start
set movs $hints
return
}
# ### ### ### ######### ######### #########
# ### ### ### ######### ######### #########
## Type API implementation.
proc ValidateSerial {e prefix} {
if {![catch {Validate $e} msg]} return
return -code error "$prefix, $msg"
}
proc Validate {e} {
if {[llength $e] == 0} {
return -code error "invalid empty expression list"
}
set op [lindex $e 0]
set ar [lrange $e 1 end]
switch -exact -- $op {
epsilon - alpha - alnum - dot {
if {[llength $ar] > 0} {
return -code error "wrong#args for \"$op\""
}
}
.. {
if {[llength $ar] != 2} {
return -code error "wrong#args for \"$op\""
}
# Leaf, arguments are not expressions to validate.
}
n - t {
if {[llength $ar] != 1} {
return -code error "wrong#args for \"$op\""
}
# Leaf, argument is not expression to validate.
}
& - ! - * - + - ? {
if {[llength $ar] != 1} {
return -code error "wrong#args for \"$op\""
}
Validate [lindex $ar 0]
}
x - / {
if {![llength $ar]} {
return -code error "wrong#args for \"$op\""
}
foreach e $ar {
Validate $e
}
}
default {
return -code error "invalid operator \"$op\""
}
}
}
proc References {e} {
set references {}
set op [lindex $e 0]
set ar [lrange $e 1 end]
switch -exact -- $op {
epsilon - t - alpha - alnum - dot - .. {}
n {
# Remember referenced nonterminal
lappend references [lindex $ar 0]
}
& - ! - * - + - ? {
foreach r [References [lindex $ar 0]] {
lappend references $r
}
}
x - / {
foreach e $ar {
foreach r [References $e] {
lappend references $r
}
}
}
}
return $references
}
proc Rename {e old new} {
set op [lindex $e 0]
set ar [lrange $e 1 end]
switch -exact -- $op {
epsilon - t - alpha - alnum - dot - .. {return $e}
n {
if {[lindex $ar 0] ne $old} {return $e}
return [list n $new]
}
& - ! - * - + - ? {
return [list $op [Rename [lindex $ar 0] $old $new]]
}
x - / {
set res $op
foreach e $ar {
lappend res [Rename $e $old $new]
}
return $res
}
}
}
# ### ### ### ######### ######### #########
## Type Internals.
# ### ### ### ######### ######### #########
}
# ### ### ### ######### ######### #########
## Package Management
package provide grammar::peg 0.1