[PATCH] ash: improve speed of variable pattern substitution

Ron Yorston rmy at pobox.com
Wed Jul 21 12:53:58 UTC 2021


Bash pattern substitution of variables (${var/find/replace}) was
found to take time O(N^2) (or worse).  Performance can be improved
considerably if the 'find' string doesn't contain any wildcard
characters.

Detect if this is the case, remove the backslashes added to
the pattern to protect literal references to wildcards and call
a simpler replacement function to search for the pattern.

Like a similar optimisation for patterns that end with '*' this
implementation is controlled by the ASH_OPTIMIZE_FOR_SIZE option.
Since this is enabled in the default configuration there's no
effect on the size of the binary.

When ASH_OPTIMIZE_FOR_SIZE is disabled:

function                                             old     new   delta
subevalvar                                          1541    1708    +167
hasmeta                                                -     128    +128
expandarg                                           1099     987    -112
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 1/1 up/down: 295/-112)          Total: 183 bytes

Signed-off-by: Ron Yorston <rmy at pobox.com>
---
 shell/ash.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/shell/ash.c b/shell/ash.c
index 2eac6e113..18d60baf2 100644
--- a/shell/ash.c
+++ b/shell/ash.c
@@ -7024,6 +7024,27 @@ scanright(char *startp, char *rmesc, char *rmescend,
 	return NULL;
 }
 
+#if BASH_PATTERN_SUBST && !ENABLE_ASH_OPTIMIZE_FOR_SIZE
+static int hasmeta(const char *p);
+
+/*
+ * Like scanright() but for patterns that don't require the use of fnmatch().
+ */
+static char *
+scannoglob(char *startp, char *rmesc, char *pattern, size_t len)
+{
+	int match = strncmp(rmesc, pattern, len) == 0;
+	//bb_error_msg("strncmp(s:'%s',pattern:'%s',len:%lu):%d", rmesc, pattern, len, match);
+	if (match) {
+		while (len--)
+			if (*startp++ == CTLESC)
+				startp++;
+		return startp;
+	}
+	return NULL;
+}
+#endif
+
 static void varunset(const char *, const char *, const char *, int) NORETURN;
 static void
 varunset(const char *end, const char *var, const char *umsg, int varflags)
@@ -7260,6 +7281,12 @@ subevalvar(char *start, char *str, int strloc,
 	if (subtype == VSREPLACE || subtype == VSREPLACEALL) {
 		int len;
 		char *idx, *end;
+# if !ENABLE_ASH_OPTIMIZE_FOR_SIZE
+		int isglob;
+		size_t patlen;
+# else
+		const int isglob = 1;
+# endif
 
 		if (!repl) {
 			//bb_error_msg("str9:'%s' slash_pos:%d", str, slash_pos);
@@ -7275,12 +7302,35 @@ subevalvar(char *start, char *str, int strloc,
 		if (str[0] == '\0')
 			goto out1;
 
+# if !ENABLE_ASH_OPTIMIZE_FOR_SIZE
+		isglob = hasmeta(str);
+		if (!isglob) {
+			/* If the pattern doesn't have any special characters remove
+			 * the backslash escapes that were added by rmescapes(). */
+			char *s, *t;
+			s = t = str;
+			while (*t) {
+				if (*t == '\\')
+					if (*++t == '\0')
+						break;
+				*s++ = *t++;
+			}
+			*s = '\0';
+			patlen = strlen(str);
+		}
+# endif
+
 		len = 0;
 		idx = startp;
 		end = str - 1;
 		while (idx <= end) {
  try_to_match:
-			loc = scanright(idx, rmesc, rmescend, str, quotes, 1);
+			if (isglob)
+				loc = scanright(idx, rmesc, rmescend, str, quotes, 1);
+# if !ENABLE_ASH_OPTIMIZE_FOR_SIZE
+			else
+				loc = scannoglob(idx, rmesc, str, patlen);
+# endif
 			//bb_error_msg("scanright('%s'):'%s'", str, loc);
 			if (!loc) {
 				/* No match, advance */
@@ -7300,7 +7350,7 @@ subevalvar(char *start, char *str, int strloc,
 				len++;
 				rmesc++;
 				/* continue; - prone to quadratic behavior, smarter code: */
-				if (str[0] == '*') {
+				if (isglob && str[0] == '*') {
 					/* Pattern is "*foo". If "*foo" does not match "long_string",
 					 * it would never match "ong_string" etc, no point in trying.
 					 */
-- 
2.31.1



More information about the busybox mailing list