|
|
@@ -20,13 +20,40 @@ |
|
|
|
#define DEBUG_PRINTF(...) do {} while (false) |
|
|
|
#endif |
|
|
|
|
|
|
|
char* memcasemem(char* haystack, size_t haystacklen, char* needle, size_t needlelen) {
	// Case-insensitive (for ASCII) slower version of memmem.
	//
	// haystack/haystacklen: buffer to search and its length in bytes.
	// needle/needlelen:     byte sequence to look for and its length in bytes.
	//                       needle must already be in lower-case.
	//
	// Returns a pointer to the first occurrence of needle inside haystack,
	// or NULL if there is none. An empty needle matches at the start of
	// the haystack (the convention memmem follows).
	if (needlelen == 0) {
		return haystack;
	}
	if (needlelen > haystacklen) {
		// A needle longer than the haystack can never be in there.
		return NULL;
	}

	// Last position at which a full needle still fits; this position itself
	// must be tried, hence the inclusive comparison in the loop below.
	char* last = haystack + (haystacklen - needlelen);

	for (char* start = haystack; start <= last; ++start) {
		size_t i;
		// start + i never leaves the haystack because start <= last.
		for (i = 0; i < needlelen; ++i) {
			// Compare as unsigned char on both sides: tolower() yields a
			// non-negative value, whereas a plain char may be negative for
			// bytes >= 0x80 and would then never compare equal.
			if (tolower((unsigned char)start[i]) != (unsigned char)needle[i]) {
				break;
			}
		}
		if (i == needlelen) {
			// Full match
			return start;
		}
	}

	// Every candidate position was tried without success.
	return NULL;
}
|
|
|
|
|
|
|
int main(int argc, char* argv[]) { |
|
|
|
// Read stdin, decode HTTP responses, dump all bodies to stdout. |
|
|
|
// stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses). |
|
|
|
// One LF is inserted at the end of each response to ensure that a new response always begins on a new line. |
|
|
|
// If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake <base> tag: <base href="URL">. The line is terminated with a LF. |
|
|
|
// Headers and chunk lines must fit into BUFSIZE. |
|
|
|
// Does not fully comply with the HTTP spec. For example, headers must be capitalised canonically, and continuation lines are unsupported. |
|
|
|
// Does not fully comply with the HTTP spec. For example, continuation lines are unsupported. |
|
|
|
char buf[2 * BUFSIZE]; |
|
|
|
size_t n; |
|
|
|
int state = STATE_HEADERS; |
|
|
@@ -119,7 +146,7 @@ checkstate: |
|
|
|
eoh += 4; |
|
|
|
DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp); |
|
|
|
|
|
|
|
m0 = memmem(bufp, n, "\r\nContent-Length:", 17); |
|
|
|
m0 = memcasemem(bufp, n, "\r\ncontent-length:", 17); |
|
|
|
if (m0 && m0 < eoh) { |
|
|
|
DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp); |
|
|
|
m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2); |
|
|
@@ -151,7 +178,7 @@ checkstate: |
|
|
|
|
|
|
|
state = STATE_BODY; |
|
|
|
} else { |
|
|
|
m0 = memmem(bufp, n, "\r\nTransfer-Encoding:", 20); |
|
|
|
m0 = memcasemem(bufp, n, "\r\ntransfer-encoding:", 20); |
|
|
|
if (!m0 || m0 >= eoh) { |
|
|
|
fprintf(stderr, "Error: Content-Length and Transfer-Encoding missing\n"); |
|
|
|
return 1; |
|
|
@@ -180,7 +207,7 @@ checkstate: |
|
|
|
} |
|
|
|
|
|
|
|
if (html_fake_base) { |
|
|
|
m0 = memmem(bufp, n, "\r\nContent-Type:", 15); |
|
|
|
m0 = memcasemem(bufp, n, "\r\ncontent-type:", 15); |
|
|
|
if (m0 && m0 < eoh) { |
|
|
|
DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp); |
|
|
|
m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2); |
|
|
|