[git commit] awk: tighten rules in action parsing

Denys Vlasenko vda.linux at googlemail.com
Fri Jul 2 23:16:48 UTC 2021


commit: https://git.busybox.net/busybox/commit/?id=2b65e73db3254a7228802886546152c72217017d
branch: https://git.busybox.net/busybox/commit/?id=refs/heads/master

Disallow:
    BEGIN
	{ action }  - must start on the same line
Disallow:
    func f()
	print "hello" - must be in {...}

function                                             old     new   delta
chain_until_rbrace                                     -      41     +41
parse_program                                        307     336     +29
chain_group                                          649     616     -33
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 1/1 up/down: 70/-33)             Total: 37 bytes

Signed-off-by: Denys Vlasenko <vda.linux at googlemail.com>
---
 editors/awk.c | 108 +++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 66 insertions(+), 42 deletions(-)

diff --git a/editors/awk.c b/editors/awk.c
index a1a2afd1d..c68416873 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -1549,29 +1549,35 @@ static node *chain_loop(node *nn)
 	return n;
 }
 
+static void chain_until_rbrace(void)
+{
+	uint32_t tc;
+	while ((tc = next_token(TS_GRPSEQ | TC_RBRACE)) != TC_RBRACE) {
+		debug_printf_parse("%s: !TC_RBRACE\n", __func__);
+		if (tc == TC_NEWLINE)
+			continue;
+		rollback_token();
+		chain_group();
+	}
+	debug_printf_parse("%s: TC_RBRACE\n", __func__);
+}
+
 /* parse group and attach it to chain */
 static void chain_group(void)
 {
-	uint32_t c;
+	uint32_t tc;
 	node *n, *n2, *n3;
 
 	do {
-		c = next_token(TS_GRPSEQ);
-	} while (c & TC_NEWLINE);
+		tc = next_token(TS_GRPSEQ);
+	} while (tc == TC_NEWLINE);
 
-	if (c & TC_LBRACE) {
+	if (tc == TC_LBRACE) {
 		debug_printf_parse("%s: TC_LBRACE\n", __func__);
-		while ((c = next_token(TS_GRPSEQ | TC_RBRACE)) != TC_RBRACE) {
-			debug_printf_parse("%s: !TC_RBRACE\n", __func__);
-			if (c & TC_NEWLINE)
-				continue;
-			rollback_token();
-			chain_group();
-		}
-		debug_printf_parse("%s: TC_RBRACE\n", __func__);
+		chain_until_rbrace();
 		return;
 	}
-	if (c & (TS_OPSEQ | TS_OPTERM)) {
+	if (tc & (TS_OPSEQ | TS_OPTERM)) {
 		debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", __func__);
 		rollback_token();
 		chain_expr(OC_EXEC | Vx);
@@ -1675,37 +1681,48 @@ static void chain_group(void)
 
 static void parse_program(char *p)
 {
-	uint32_t tclass;
-	node *cn;
-	func *f;
-	var *v;
-
 	debug_printf_parse("%s()\n", __func__);
 
 	g_pos = p;
 	t_lineno = 1;
-	while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE |
-			TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) {
+	for (;;) {
+		uint32_t tclass;
 
-		if (tclass & TS_OPTERM) {
+		tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE |
+			TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL);
+
+		if (tclass == TC_EOF) {
+			debug_printf_parse("%s: TC_EOF\n", __func__);
+			break;
+		}
+		if (tclass & TS_OPTERM) { /* ; or <newline> */
 			debug_printf_parse("%s: TS_OPTERM\n", __func__);
+//NB: gawk allows many newlines, but does not allow more than one semicolon:
+//  BEGIN {...}<newline>;<newline>;
+//would complain "each rule must have a pattern or an action part".
+//Same message for
+//  ; BEGIN {...}
 			continue;
 		}
-
-		seq = &mainseq;
-		if (tclass & TC_BEGIN) {
+		if (tclass == TC_BEGIN) {
 			debug_printf_parse("%s: TC_BEGIN\n", __func__);
 			seq = &beginseq;
-//TODO: ensure there is no newline between BEGIN and {
-//next_token(TC_LBRACE); rollback_token();
-			chain_group();
-		} else if (tclass & TC_END) {
+			/* ensure there is no newline between BEGIN and { */
+			next_token(TC_LBRACE);
+			chain_until_rbrace();
+			continue;
+		}
+		if (tclass == TC_END) {
 			debug_printf_parse("%s: TC_END\n", __func__);
 			seq = &endseq;
-//TODO: ensure there is no newline between END and {
-//next_token(TC_LBRACE); rollback_token();
-			chain_group();
-		} else if (tclass & TC_FUNCDECL) {
+			/* ensure there is no newline between END and { */
+			next_token(TC_LBRACE);
+			chain_until_rbrace();
+			continue;
+		}
+		if (tclass == TC_FUNCDECL) {
+			func *f;
+
 			debug_printf_parse("%s: TC_FUNCDECL\n", __func__);
 			next_token(TC_FUNCTION);
 			f = newfunc(t_string);
@@ -1716,6 +1733,7 @@ static void parse_program(char *p)
 			//f->nargs = 0; - already is
 			/* func arg list: comma sep list of args, and a close paren */
 			for (;;) {
+				var *v;
 				if (next_token(TC_VARIABLE | TC_RPAREN) == TC_RPAREN) {
 					if (f->nargs == 0)
 						break; /* func() is ok */
@@ -1730,31 +1748,37 @@ static void parse_program(char *p)
 				/* it was a comma, we ate it */
 			}
 			seq = &f->body;
-//TODO: ensure there is { after "func F(...)" - but newlines are allowed
-//while (next_token(TC_LBRACE | TC_NEWLINE) == TC_NEWLINE) continue; rollback_token();
-			chain_group();
+			/* ensure there is { after "func F(...)" - but newlines are allowed */
+			while (next_token(TC_LBRACE | TC_NEWLINE) == TC_NEWLINE)
+				continue;
+			chain_until_rbrace();
 			hash_clear(ahash);
-		} else if (tclass & TS_OPSEQ) {
+			continue;
+		}
+		seq = &mainseq;
+		if (tclass & TS_OPSEQ) {
+			node *cn;
+
 			debug_printf_parse("%s: TS_OPSEQ\n", __func__);
 			rollback_token();
 			cn = chain_node(OC_TEST);
 			cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_LBRACE);
-			if (t_tclass & TC_LBRACE) {
+			if (t_tclass == TC_LBRACE) {
 				debug_printf_parse("%s: TC_LBRACE\n", __func__);
 				rollback_token();
 				chain_group();
 			} else {
+				/* no action, assume default "{ print }" */
 				debug_printf_parse("%s: !TC_LBRACE\n", __func__);
 				chain_node(OC_PRINT);
 			}
 			cn->r.n = mainseq.last;
-		} else /* if (tclass & TC_LBRACE) */ {
-			debug_printf_parse("%s: TC_LBRACE(?)\n", __func__);
-			rollback_token();
-			chain_group();
+			continue;
 		}
+		/* tclass == TC_LBRACE */
+		debug_printf_parse("%s: TC_LBRACE(?)\n", __func__);
+		chain_until_rbrace();
 	}
-	debug_printf_parse("%s: TC_EOF\n", __func__);
 }
 
 


More information about the busybox-cvs mailing list