[PATCH v2] Improve performance of strstr

Wilco Dijkstra

2018-10-03 13:27:37 UTC

v2: Add documentation comment back in. Reduce size of shift table further to
gain another 10% performance on short strings.

This patch significantly improves performance of strstr by using Sunday's
Quick-Search algorithm. Due to its simplicity it has the best average
performance of string matching algorithms on almost all inputs. It uses a
bad-character shift table to skip past mismatches.
The needle length is limited to 254 - this reduces the shift table memory
4 to 8 times, lowering preprocessing overhead and minimizing cache effects.
The limit also implies its worst-case performance is linear.

Larger needles are processed by the Two-Way algorithm. The macro AVAILABLE
has been simplified as the haystack length is always precomputed. This
results in a 2.5 times speedup for large needles, reducing the performance
drop when the Quick-Search algorithm can't be used.

The code for small needles 1-4 bytes has been simplified and now uses unsigned
char. Since the optimized code relies on 8-bit chars, we defer to the
size-optimized implementation if CHAR_BIT > 8.

The performance gain of finding a set of randomly chosen words of size 8 in
256 bytes of English text is 14 times on AArch64. For longer haystacks the
gain is well over 20 times.

The size-optimized strstr has also been rewritten from scratch to improve
performance. On the same test the performance gain is 69%.

Tested against GLIBC testsuite and passes randomized tests.

---
diff --git a/newlib/libc/string/strstr.c b/newlib/libc/string/strstr.c
index e72b4bd9125f928486f52bb3ebd199eacef7cfaf..2c6d895087d61be52b379fc46907b4b29c73cef9 100644
--- a/newlib/libc/string/strstr.c
+++ b/newlib/libc/string/strstr.c
@@ -1,3 +1,31 @@
+/* Optimized strstr function.
+ Copyright (c) 2018 Arm Ltd. All rights reserved.
+
+ SPDX-License-Identifier: BSD-3-Clause
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ 3. The name of the company may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
/*
FUNCTION
<<strstr>>---find string segment
@@ -29,141 +57,134 @@ QUICKREF
*/

#include <string.h>
+#include <limits.h>
+
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) \
+ || CHAR_BIT > 8

-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+/* Small and efficient strstr implementation. */
char *
-strstr (const char *searchee,
- const char *lookfor)
+strstr (const char *hs, const char *ne)
{
- /* Less code size, but quadratic performance in the worst case. */
- if (*searchee == 0)
- {
- if (*lookfor)
- return (char *) NULL;
- return (char *) searchee;
- }
+ size_t i;
+ int c = ne[0];

- while (*searchee)
- {
- size_t i;
- i = 0;
+ if (c == 0)
+ return (char*)hs;

- while (1)
- {
- if (lookfor[i] == 0)
- {
- return (char *) searchee;
- }
-
- if (lookfor[i] != searchee[i])
- {
- break;
- }
- i++;
- }
- searchee++;
+ for ( ; hs[0] != '\0'; hs++)
+ {
+ if (hs[0] != c)
+ continue;
+ for (i = 1; ne[i] != 0; i++)
+ if (hs[i] != ne[i])
+ break;
+ if (ne[i] == '\0')
+ return (char*)hs;
}

- return (char *) NULL;
+ return NULL;
+}

#else /* compilation for speed */

# define RETURN_TYPE char *
-# define AVAILABLE(h, h_l, j, n_l) \
- (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \
- && ((h_l) = (j) + (n_l)))
+/* AVAILABLE assumes haystack length has been precomputed. */
+# define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
# include "str-two-way.h"

+/* Number of bits used to index shift table. */
+#define SHIFT_TABLE_BITS 6
+
static inline char *
-strstr2 (const char *hs, const char *ne)
+strstr2 (const unsigned char *hs, const unsigned char *ne)
{
uint32_t h1 = (ne[0] << 16) | ne[1];
uint32_t h2 = 0;
- int c = hs[0];
- while (h1 != h2 && c != 0)
- {
+ for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
h2 = (h2 << 16) | c;
- c = *++hs;
- }
return h1 == h2 ? (char *)hs - 2 : NULL;
}

static inline char *
-strstr3 (const char *hs, const char *ne)
+strstr3 (const unsigned char *hs, const unsigned char *ne)
{
uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8);
uint32_t h2 = 0;
- int c = hs[0];
- while (h1 != h2 && c != 0)
- {
+ for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
h2 = (h2 | c) << 8;
- c = *++hs;
- }
return h1 == h2 ? (char *)hs - 3 : NULL;
}

static inline char *
-strstr4 (const char *hs, const char *ne)
+strstr4 (const unsigned char *hs, const unsigned char *ne)
{
uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8) | ne[3];
uint32_t h2 = 0;
- int c = hs[0];
- while (h1 != h2 && c != 0)
- {
- h2 = (h2 << 8) | c;
- c = *++hs;
- }
+ for (int c = hs[0]; c != 0 && h1 != h2; c = *++hs)
+ h2 = (h2 << 8) | c;
return h1 == h2 ? (char *)hs - 4 : NULL;
}

+/* Extremely fast strstr algorithm with guaranteed linear-time performance.
+ Small needles up to size 4 use a dedicated linear search. Longer needles
+ up to size 254 use Sunday's Quick-Search algorithm. Due to its simplicity
+ it has the best average performance of string matching algorithms on almost
+ all inputs. It uses a bad-character shift table to skip past mismatches.
+ By limiting the needle length to 254, the shift table can be reduced to 8
+ bits per entry, lowering preprocessing overhead and minimizing cache effects.
+ The limit also implies the worst-case performance is linear.
+ Even larger needles are processed by the linear-time Two-Way algorithm.
+*/
char *
-strstr (const char *searchee,
- const char *lookfor)
+strstr (const char *haystack, const char *needle)
{
- /* Larger code size, but guaranteed linear performance. */
- const char *haystack = searchee;
- const char *needle = lookfor;
- size_t needle_len; /* Length of NEEDLE. */
- size_t haystack_len; /* Known minimum length of HAYSTACK. */
- int ok = 1; /* True if NEEDLE is prefix of HAYSTACK. */
+ const unsigned char *hs = (const unsigned char *) haystack;
+ const unsigned char *ne = (const unsigned char *) needle;

/* Handle short needle special cases first. */
- if (needle[0] == '\0')
- return (char *) haystack;
- if (needle[1] == '\0')
- return strchr (haystack, needle[0]);
- if (needle[2] == '\0')
- return strstr2 (haystack, needle);
- if (needle[3] == '\0')
- return strstr3 (haystack, needle);
- if (needle[4] == '\0')
- return strstr4 (haystack, needle);
-
- /* Determine length of NEEDLE, and in the process, make sure
- HAYSTACK is at least as long (no point processing all of a long
- NEEDLE if HAYSTACK is too short). */
- while (*haystack && *needle)
- ok &= *haystack++ == *needle++;
- if (*needle)
+ if (ne[0] == '\0')
+ return (char *) hs;
+ if (ne[1] == '\0')
+ return (char*)strchr (hs, ne[0]);
+ if (ne[2] == '\0')
+ return strstr2 (hs, ne);
+ if (ne[3] == '\0')
+ return strstr3 (hs, ne);
+ if (ne[4] == '\0')
+ return strstr4 (hs, ne);
+
+ size_t ne_len = strlen (ne);
+ size_t hs_len = strlen (hs);
+
+ /* Ensure haystack length is >= needle length. */
+ if (hs_len < ne_len)
return NULL;
- if (ok)
- return (char *) searchee;
-
- /* Reduce the size of haystack using strchr, since it has a smaller
- linear coefficient than the Two-Way algorithm. */
- needle_len = needle - lookfor;
- haystack = strchr (searchee + 1, *lookfor);
- if (!haystack || needle_len == 1)
- return (char *) haystack;
- haystack_len = (haystack > searchee + needle_len ? 1
- : needle_len + searchee - haystack);
-
- /* Perform the search. */
- if (needle_len < LONG_NEEDLE_THRESHOLD)
- return two_way_short_needle ((const unsigned char *) haystack,
- haystack_len,
- (const unsigned char *) lookfor, needle_len);
- return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
- (const unsigned char *) lookfor, needle_len);
-#endif /* compilation for speed */
+
+ /* Use the Quick-Search algorithm for needle lengths less than 255. */
+ if (__builtin_expect (ne_len < 255, 1))
+ {
+ uint8_t shift[1 << SHIFT_TABLE_BITS];
+ const unsigned char *end = hs + hs_len - ne_len;
+ hs--;
+
+ /* Initialize bad character shift hash table. */
+ memset (shift, ne_len + 1, sizeof (shift));
+ for (int i = 0; i < ne_len; i++)
+ shift[ne[i] % sizeof (shift)] = ne_len - i;
+
+ /* Search by skipping past bad characters. */
+ size_t tmp = shift[hs[ne_len] % sizeof (shift)];
+ for (hs += tmp; hs <= end; hs += tmp)
+ {
+ tmp = shift[hs[ne_len] % sizeof (shift)];
+ if (memcmp (hs, ne, ne_len) == 0)
+ return (char*) hs;
+ }
+ return NULL;
+ }
+
+ /* Use Two-Way algorithm for very long needles. */
+ return two_way_long_needle (hs, hs_len, ne, ne_len);
}
+#endif /* compilation for speed */