Skip to content

Commit

Permalink
Use a branchless binary search for extension<->MIME-Types
Browse files Browse the repository at this point in the history
This uses the excellent branchless binary search by Orson Peters[1],
based on Malte Skarupke's version[2] of Leonard E Shar's version of
a binary search.  It's a fascinating implementation that ends up
becoming just a very tight loop using CMOV instructions, and finishing
with another CMOV instruction.

It's so clean that I removed the fast path using STRING_SWITCH as
that's not necessary anymore.  A nice side effect of this change is
that the string "application/octet-stream" doesn't appear in the binary
anymore (except in the debug version due to assertions), as it's now
part of the compressed blob that mimegen generates.

A good segue to this commit would be porting the other usage of
bsearch() to this implementation.

[1] https://orlp.net/blog/bitwise-binary-search/
[2] https://probablydance.com/2023/04/27/beautiful-branchless-binary-search/
  • Loading branch information
lpereira committed May 14, 2024
1 parent f9eedda commit 5d77176
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 58 deletions.
59 changes: 31 additions & 28 deletions src/bin/tools/mimegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,29 +161,6 @@ static char *compress_output(const struct output *output, size_t *outlen)
return compressed;
}

/* Reports whether `ext` matches one of the extensions in the table below,
 * as decided by strcaseequal_neutral().  Entries are tried in table order
 * and the first match wins.
 *
 * STRING_SWITCH_L() is deliberately not used so this tool does not have to
 * bring in lwan.h.
 * FIXME: maybe use an X-macro to keep this list in sync with lwan-tables.c? */
static bool is_builtin_ext(const char *ext)
{
    static const char *const builtin_exts[] = {
        "css", "gif", "htm", "html", "jpg", "js", "png", "txt",
    };
    const size_t n_builtin = sizeof(builtin_exts) / sizeof(builtin_exts[0]);

    for (size_t idx = 0; idx < n_builtin; idx++) {
        if (strcaseequal_neutral(ext, builtin_exts[idx]))
            return true;
    }

    return false;
}

int main(int argc, char *argv[])
{
/* 32k is sufficient for the provided mime.types, but we can reallocate
Expand Down Expand Up @@ -258,11 +235,6 @@ int main(int argc, char *argv[])
ext[8] = '\0';
}

/* Lwan has a fast-path for some common extensions, so don't bundle them
* in this table if not really needed. */
if (is_builtin_ext(ext))
continue;

k = strdup(ext);
v = strdup(mime_type);

Expand All @@ -286,6 +258,22 @@ int main(int argc, char *argv[])
}
}

{
char *k = strdup("bin");
char *v = strdup("application/octet-stream");
if (!k || !v) {
fprintf(stderr, "Could not allocate memory\n");
fclose(fp);
return 1;
}
int r = hash_add_unique(ext_mime, k, v);
if (r != 0 && r != -EEXIST) {
fprintf(stderr, "Could not add fallback mime entry\n");
fclose(fp);
return 1;
}
}

/* Get sorted list of extensions. */
exts = calloc(hash_get_count(ext_mime), sizeof(char *));
if (!exts) {
Expand All @@ -305,6 +293,7 @@ int main(int argc, char *argv[])
fclose(fp);
return 1;
}
ssize_t bin_index = -1;
for (i = 0; i < hash_get_count(ext_mime); i++) {
uint64_t ext_lower = 0;

Expand All @@ -322,6 +311,9 @@ int main(int argc, char *argv[])
fclose(fp);
return 1;
}

if (bin_index < 0 && streq(exts[i], "bin"))
bin_index = (ssize_t)i;
}
for (i = 0; i < hash_get_count(ext_mime); i++) {
if (output_append(&output, hash_find(ext_mime, exts[i])) < 0) {
Expand All @@ -331,6 +323,12 @@ int main(int argc, char *argv[])
}
}

if (bin_index < 0) {
fprintf(stderr, "Could not find fallback item after sorting!\n");
fclose(fp);
return 1;
}

/* Compress blob. */
compressed = compress_output(&output, &compressed_size);
if (!compressed) {
Expand All @@ -349,10 +347,15 @@ int main(int argc, char *argv[])
#else
printf("/* Compressed with zlib (deflate) */\n");
#endif

unsigned int entries_floor = 1u << (31 - __builtin_clz(hash_get_count(ext_mime)));

printf("#pragma once\n");
printf("#define MIME_UNCOMPRESSED_LEN %zu\n", output.used);
printf("#define MIME_COMPRESSED_LEN %lu\n", compressed_size);
printf("#define MIME_ENTRIES %d\n", hash_get_count(ext_mime));
printf("#define MIME_ENTRIES_FLOOR %d\n", entries_floor);
printf("#define MIME_ENTRY_FALLBACK %ld\n", bin_index);
printf("static const unsigned char mime_entries_compressed[] = {\n");
for (i = 1; compressed_size; compressed_size--, i++)
printf("0x%02x,%c", compressed[i - 1] & 0xff, " \n"[i % 13 == 0]);
Expand Down
47 changes: 17 additions & 30 deletions src/lib/lwan-tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

static unsigned char uncompressed_mime_entries[MIME_UNCOMPRESSED_LEN];
static char *mime_types[MIME_ENTRIES];
static uint64_t *mime_extensions;
static bool mime_entries_initialized = false;

void lwan_tables_shutdown(void)
Expand Down Expand Up @@ -86,6 +87,7 @@ void lwan_tables_init(void)
mime_types[i] = (char *)ptr;
ptr += strlen((const char *)ptr) + 1;
}
mime_extensions = (uint64_t *)uncompressed_mime_entries;

mime_entries_initialized = true;

Expand Down Expand Up @@ -120,34 +122,25 @@ LWAN_SELF_TEST(status_codes)
#undef ASSERT_STATUS
}

static int compare_mime_entry(const void *a, const void *b)
static ALWAYS_INLINE const char *bsearch_mime_type(uint64_t ext)
{
const uint64_t exta = string_as_uint64((const char *)a);
const uint64_t extb = string_as_uint64((const char *)b);

return (exta > extb) - (exta < extb);
/* Based on https://orlp.net/blog/bitwise-binary-search/ */
int64_t b = ext > mime_extensions[MIME_ENTRIES / 2]
? MIME_ENTRIES - MIME_ENTRIES_FLOOR
: -1;
for (uint64_t bit = MIME_ENTRIES_FLOOR >> 1; bit != 0; bit >>= 1) {
if (ext > mime_extensions[b + (int64_t)bit])
b += (int64_t)bit;
}
return mime_types[mime_extensions[b + 1] == ext ? b + 1
: MIME_ENTRY_FALLBACK];
}

const char *
lwan_determine_mime_type_for_file_name(const char *file_name)
const char *lwan_determine_mime_type_for_file_name(const char *file_name)
{
char *last_dot = strrchr(file_name, '.');
if (UNLIKELY(!last_dot))
goto fallback;

STRING_SWITCH_L(last_dot) {
case STR4_INT_L('.','c','s','s'): return "text/css";
case STR4_INT_L('.','g','i','f'): return "image/gif";
case STR4_INT_L('.','h','t','m'): return "text/html";
case STR4_INT_L('.','j','p','g'): return "image/jpeg";
case STR4_INT_L('.','j','s',' '): return "text/javascript";
case STR4_INT_L('.','p','n','g'): return "image/png";
case STR4_INT_L('.','t','x','t'): return "text/plain";
}

if (LIKELY(*last_dot)) {
if (LIKELY(last_dot && *last_dot)) {
uint64_t key = 0;
const unsigned char *extension;

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
Expand All @@ -157,17 +150,11 @@ lwan_determine_mime_type_for_file_name(const char *file_name)
* 8 bytes per extension. */
strncpy((char *)&key, last_dot + 1, 8);
#pragma GCC diagnostic pop
key &= ~0x2020202020202020ull;
key = htobe64(key);

extension = bsearch(&key, uncompressed_mime_entries, MIME_ENTRIES, 8,
compare_mime_entry);
if (LIKELY(extension))
return mime_types[(extension - uncompressed_mime_entries) / 8];
return bsearch_mime_type(htobe64(key & ~0x2020202020202020ull));
}

fallback:
return "application/octet-stream";
return mime_types[MIME_ENTRY_FALLBACK];
}

#include "lookup-http-status.h" /* generated by statuslookupgen */
Expand Down

0 comments on commit 5d77176

Please sign in to comment.