/*
 * Case-insensitive (ASCII) counterpart of memmem(): find the first occurrence
 * of `needle` (needlelen bytes) inside `haystack` (haystacklen bytes).
 *
 * Each haystack byte is folded with tolower() before comparison; the needle is
 * compared as-is, so the caller must supply it already in lower case.
 *
 * Returns a pointer to the first byte of the match inside `haystack`, or NULL
 * if there is no match. An empty needle matches at the start of the haystack.
 */
char* memcasemem(char* haystack, size_t haystacklen, char* needle, size_t needlelen) {
	if (needlelen == 0) {
		// Nothing to compare; avoid touching needle[0], which may not exist.
		return haystack;
	}
	if (needlelen > haystacklen) {
		// A needle longer than the haystack can never be in there.
		return NULL;
	}

	// Last start position at which the whole needle still fits. This bound is
	// inclusive: a match may end exactly at the final byte of the haystack.
	char* last_start = haystack + (haystacklen - needlelen);

	for (char* start = haystack; start <= last_start; ++start) {
		size_t matched = 0;
		// Compare in unsigned char space on both sides so that bytes >= 0x80
		// are not mis-compared when plain char is signed.
		while (matched < needlelen
		       && tolower((unsigned char)start[matched]) == (unsigned char)needle[matched]) {
			++matched;
		}
		if (matched == needlelen) {
			// Full match
			return start;
		}
	}

	// No match: return explicitly instead of falling off the end of a
	// non-void function.
	return NULL;
}
char buf[2 * BUFSIZE]; size_t n; int state = STATE_HEADERS; @@ -119,7 +146,7 @@ checkstate: eoh += 4; DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp); - m0 = memmem(bufp, n, "\r\nContent-Length:", 17); + m0 = memcasemem(bufp, n, "\r\ncontent-length:", 17); if (m0 && m0 < eoh) { DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp); m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2); @@ -151,7 +178,7 @@ checkstate: state = STATE_BODY; } else { - m0 = memmem(bufp, n, "\r\nTransfer-Encoding:", 20); + m0 = memcasemem(bufp, n, "\r\ntransfer-encoding:", 20); if (!m0 || m0 >= eoh) { fprintf(stderr, "Error: Content-Length and Transfer-Encoding missing\n"); return 1; @@ -180,7 +207,7 @@ checkstate: } if (html_fake_base) { - m0 = memmem(bufp, n, "\r\nContent-Type:", 15); + m0 = memcasemem(bufp, n, "\r\ncontent-type:", 15); if (m0 && m0 < eoh) { DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp); m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);