Making my Yacc sit up and beg
I'm more convinced than ever that the correct approach here is a GLR grammar, if at all possible. However, inspired by @Kaz, I produced the following yacc/bison grammar with an LALR(1) grammar (not even using precedence declarations).
Of course, it cheats, since the problem cannot be solved with an LALR(1) grammar. At appropriate intervals, it walks the constructed tree of IF THEN and IF THEN ELSE expressions, and moves the ELSE clauses as required.
Nodes which need to be re-examined for possible motion are given the AST nodetype IFSEQ and the ELSE clauses are attached with the traditional tightest match grammar, using a classic matched-if/unmatched-if grammar. A fully-matched IF THEN ELSE clause does not need to be rearranged; the tree rewrite will apply to the expression associated with the first ELSE whose right-hand operand is unmatched (if there is one). Keeping the fully-matched prefix of an IF expression separate from the tail which needs to be rearranged required almost-duplicating some rules; the almost-duplicated rules differ in that their actions directly produce TERNARY nodes instead of IFSEQ nodes.
In order to correctly answer the question, it would also be necessary to rearrange some IFF nodes, since the IFF binds more weakly than the THEN clause and more tightly than the ELSE clause. I think this means:
IF p THEN q IFF IF r THEN s ==> ((p → q) ↔ (r → s))
IF p THEN q IFF r ELSE s IFF t ==> (p ? (q ↔ r) : (s ↔ t))
IF p THEN q IFF IF r THEN s ELSE t IFF u ==> (p ? (q ↔ (r → s)) : (t ↔ u))
although I'm not sure that is what is being asked for (particularly the last one) and I really don't think it's a good idea. In the grammar below, if you want IFF to apply to an IF p THEN q subexpression, you will have to use parentheses; IF p THEN q IFF r produces p → (q ↔ r) and p IFF IF q THEN r is a syntax error.
Frankly, I think this whole thing would be easier using arrows for conditionals and biconditionals (as in the glosses above), and using IF THEN ELSE only for ternary selector expressions (written above with C-style ? : syntax, which is another possibility). That will generate far fewer surprises. But it's not my language.
One solution for the biconditional operator with floating precedence would be to parse in two passes. The first pass would only identify the IF p THEN q operators without an attached ELSE, using a mechanism similar to the one proposed here, and change them to p -> q by deleting the IF and changing the spelling of THEN. Other operators would not be parsed and parentheses would be retained. It would then feed the resulting token stream into a second LALR parser with a more traditional grammar style. I might get around to coding that only because I think that two-pass bison parsers are occasionally useful and there are few examples floating around.
Here's the tree-rewriting parser. I apologise for the length:
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* yyerror is defined in the epilogue of this file; yylex is not defined
 * here (presumably supplied by the flex scanner -- confirm at link time).
 */
void yyerror(const char* msg);
int yylex(void);
/* AST node; `type` selects which member of the anonymous union is live. */
typedef struct Node Node;
/* Node kinds. IFSEQ is a provisional kind assigned by the grammar actions
 * to if/then[/else] nodes; shift_elses() retags every IFSEQ node it visits
 * as either TERNARY or IMPL.
 */
enum AstType { ATOM, NEG, CONJ, DISJ, IMPL, BICOND, TERNARY,
IFSEQ
};
struct Node {
enum AstType type;
union {
const char* atom; /* identifier text; live when type == ATOM, released by node_free() */
Node* child[3]; /* operands for every other kind; unused slots are NULL */
};
};
/* Allocate an operator node with up to three operands. */
Node* node(enum AstType type, Node* op1, Node* op2, Node* op3);
/* Allocate an ATOM node; the node keeps `name` and node_free() frees it. */
Node* atom(const char* name);
/* Recursively release a tree (NULL is accepted). */
void node_free(Node*);
/* Write a parenthesised prefix rendering of a tree to the given stream. */
void node_print(Node*, FILE*);
/* Singly linked stack of else clauses collected from a chain of IFSEQ
 * nodes: `action` is the else-branch subtree, `next` the entry below it.
 */
typedef struct ElseStack ElseStack;
struct ElseStack {
Node* action;
ElseStack* next;
};
/* Push the else clauses found under the first argument; returns the new top. */
ElseStack* build_else_stack(Node*, ElseStack*);
/* Pop else clauses and re-attach them to IFSEQ nodes; returns what is left. */
ElseStack* shift_elses(Node*, ElseStack*);
%}
/* Semantic values: `name` carries the heap-allocated identifier text of a
 * T_ID token; `node` carries the AST built for a nonterminal.
 */
%union {
  const char* name;
  struct Node* node;
}

/* Release semantic values that the parser throws away (symbols popped or
 * skipped during error recovery, and whatever is left when parsing stops).
 * Without these, identifier strings and partially built trees on a line
 * with a syntax error are never freed.
 */
%destructor { free((char*)$$); } <name>
%destructor { node_free($$); } <node>

%token <name> T_ID
%token T_AND "and"
       T_ELSE "else"
       T_IF "if"
       T_IFF "iff"
       T_NOT "not"
       T_OR "or"
       T_THEN "then"
%type <node> term conj disj bicond cond mat unmat tail expr
%%
/* The input is a sequence of newline-terminated statements. */
prog  : %empty
      | prog stmt
      ;

/* Each expression is printed and then released. After a syntax error the
 * parser resynchronises at the next newline; yyerrok leaves error-recovery
 * mode immediately so that an error on the following line is reported too.
 */
stmt  : expr '\n'     { node_print($1, stdout); putchar('\n'); node_free($1); }
      | '\n'
      | error '\n'    { yyerrok; }
      ;

term  : T_ID          { $$ = atom($1); }
      | "not" term    { $$ = node(NEG, $2, NULL, NULL); }
      | '(' expr ')'  { $$ = $2; }
      ;

conj  : term
      | conj "and" term     { $$ = node(CONJ, $1, $3, NULL); }
      ;

disj  : conj
      | disj "or" conj      { $$ = node(DISJ, $1, $3, NULL); }
      ;

/* "iff" groups to the right: a iff b iff c is a iff (b iff c). */
bicond: disj
      | disj "iff" bicond   { $$ = node(BICOND, $1, $3, NULL); }
      ;

/* Matched: every "if" in the expression has its own "else". */
mat   : bicond
      | "if" expr "then" mat "else" mat
                            { $$ = node(IFSEQ, $2, $4, $6); }
      ;

/* Unmatched: at least one "if" has no "else". */
unmat : "if" expr "then" mat
                            { $$ = node(IFSEQ, $2, $4, NULL); }
      | "if" expr "then" unmat
                            { $$ = node(IFSEQ, $2, $4, NULL); }
      | "if" expr "then" mat "else" unmat
                            { $$ = node(IFSEQ, $2, $4, $6); }
      ;

/* An "if" without "else" at the point where rewriting starts. */
tail  : "if" expr "then" mat
                            { $$ = node(IFSEQ, $2, $4, NULL); }
      | "if" expr "then" unmat
                            { $$ = node(IFSEQ, $2, $4, NULL); }
      ;

/* A tail is rewritten in place: its else clauses are collected and then
 * redistributed over its IFSEQ nodes. $$ is assigned explicitly rather
 * than relying on the value the skeleton happens to preload before an
 * action runs.
 */
cond  : bicond
      | tail                { $$ = $1;
                              shift_elses($$, build_else_stack($$, NULL)); }
      | "if" expr "then" mat "else" cond
                            { $$ = node(TERNARY, $2, $4, $6); }
      ;

expr  : cond
      ;
%%
/* Collect the else clauses of a chain of IFSEQ nodes.
 *
 * Starting at `ifs`, follows the "then" operand (child[1]) and the "else"
 * operand (child[2]) of every IFSEQ node, pushing each else clause that
 * is present onto `stack`. A NULL node or a node of any other type ends
 * the walk, so the operands of non-IFSEQ nodes are never examined.
 *
 * Returns the new top of the stack. Entries are heap-allocated; they are
 * freed by shift_elses() as it pops them. Terminates the program if an
 * entry cannot be allocated.
 */
ElseStack* build_else_stack(Node* ifs, ElseStack* stack) {
  if (!ifs || ifs->type != IFSEQ) {
    return stack;
  }

  stack = build_else_stack(ifs->child[1], stack);
  if (ifs->child[2]) {
    ElseStack* top = malloc(sizeof *top);
    if (!top) {
      perror("malloc");
      exit(EXIT_FAILURE);
    }
    *top = (ElseStack) { ifs->child[2], stack };
    /* The else clause may itself be an IFSEQ chain with more elses. */
    stack = build_else_stack(ifs->child[2], top);
  }
  return stack;
}
/* Re-attach collected else clauses to a chain of IFSEQ nodes.
 *
 * A NULL node or a node that is not IFSEQ is left untouched. For an IFSEQ
 * node:
 *   - if the stack is non-empty, the top entry is claimed for this node,
 *     the remaining entries are offered first to the "then" subtree and
 *     then to the node's original "else" subtree, and the node becomes
 *     TERNARY with the claimed clause as its third operand;
 *   - if the stack is empty, both subtrees are still walked (with an
 *     empty stack) and the node becomes IMPL with no third operand.
 *
 * Each popped stack entry is freed. Returns the entries that remain.
 */
ElseStack* shift_elses(Node* n, ElseStack* stack) {
  if (!n || n->type != IFSEQ) {
    return stack;
  }

  if (stack) {
    ElseStack* top = stack;
    /* Walk the original else subtree before child[2] is overwritten. */
    stack = shift_elses(n->child[2],
                        shift_elses(n->child[1], stack->next));
    n->type = TERNARY;
    /* Store the else clause itself, not the stack cell that held it. */
    n->child[2] = top->action;
    free(top);
  }
  else {
    shift_elses(n->child[2],
                shift_elses(n->child[1], NULL));
    n->type = IMPL;
    n->child[2] = NULL;
  }
  return stack;
}
/* Allocate an operator node of kind `type` with operands op1..op3.
 * Operands that do not apply are passed as NULL. The new node becomes the
 * parent of its operands; node_free() releases them along with it.
 * Terminates the program if the node cannot be allocated.
 */
Node* node(enum AstType type, Node* op1, Node* op2, Node* op3) {
  Node* rv = malloc(sizeof *rv);
  if (!rv) {
    perror("malloc");
    exit(EXIT_FAILURE);
  }
  *rv = (Node){type, .child = {op1, op2, op3}};
  return rv;
}
/* Allocate an ATOM node for the identifier `name`.
 * The pointer is stored as-is, not copied, and node_free() passes it to
 * free(), so `name` must be heap-allocated and is owned by the node from
 * here on. Terminates the program if the node cannot be allocated.
 */
Node* atom(const char* name) {
  Node* rv = malloc(sizeof *rv);
  if (!rv) {
    perror("malloc");
    exit(EXIT_FAILURE);
  }
  *rv = (Node){ATOM, .atom = name};
  return rv;
}
/* Release a tree: an ATOM gives up its identifier string, every other
 * kind gives up its three operand slots recursively, and then the node
 * itself is freed. A NULL argument is ignored.
 */
void node_free(Node* n) {
  if (!n) {
    return;
  }

  if (n->type != ATOM) {
    node_free(n->child[0]);
    node_free(n->child[1]);
    node_free(n->child[2]);
  } else {
    free((char*)n->atom);
  }
  free(n);
}
const char* typename(enum AstType type) {
switch (type) {
case ATOM: return "ATOM";
case NEG: return "NOT" ;
case CONJ: return "CONJ";
case DISJ: return "DISJ";
case IMPL: return "IMPL";
case BICOND: return "BICOND";
case TERNARY: return "TERNARY" ;
case IFSEQ: return "IF_SEQ";
}
return "**BAD NODE TYPE**";
}
/* Write a tree to `out`. An ATOM is written as its identifier; any other
 * node is written as "(LABEL operand ...)", where the operand list stops
 * at the first empty slot. A NULL node writes nothing.
 */
void node_print(Node* n, FILE* out) {
  if (!n) {
    return;
  }
  if (n->type == ATOM) {
    fputs(n->atom, out);
    return;
  }

  fprintf(out, "(%s", typename(n->type));
  for (Node** kid = n->child; kid < n->child + 3 && *kid; ++kid) {
    fputc(' ', out);
    node_print(*kid, out);
  }
  fputc(')', out);
}
/* Report a parser diagnostic: the message followed by a newline, on
 * standard error.
 */
void yyerror(const char* msg) {
  fputs(msg, stderr);
  fputc('\n', stderr);
}
/* Run the parser once; its result becomes the process exit status.
 * Command-line arguments are not used.
 */
int main(int argc, char** argv) {
  (void)argc;
  (void)argv;

  int status = yyparse();
  return status;
}
The lexer is almost trivial. (This one uses lower-case keywords because my fingers prefer that, but it's trivial to change.)
%{
/* Scanner for the if/then/else expression grammar.
 *
 * - The lower-case keywords and, else, if, iff, not, or, then each return
 *   their own token code.
 * - Any other run of alphabetic characters returns T_ID, with a strdup'ed
 *   copy of the text in yylval.name (the strdup result is not checked).
 * - Whitespace other than newline is skipped.
 * - A newline is returned as '\n'.
 * - Any other single character is returned as its own character code.
 *
 * Token codes and yylval come from the header included below.
 */
#include "ifelse.tab.h"
%}
%option noinput nounput noyywrap nodefault
%%
and { return T_AND; }
else { return T_ELSE; }
if { return T_IF; }
iff { return T_IFF; }
not { return T_NOT; }
or { return T_OR; }
then { return T_THEN; }
[[:alpha:]]+ { yylval.name = strdup(yytext);
return T_ID; }
([[:space:]]{-}[\n])+ ;
\n { return '\n'; }
. { return *yytext;}
As written, the parser/lexer reads a line at a time, and prints the AST for each line (so multiline expressions aren't allowed). I hope it's clear how to change it.