Browse Source

Add support for non-standard header capitalisation

master
JustAnotherArchivist 1 year ago
parent
commit
887c063533
1 changed files with 31 additions and 4 deletions
  1. +31
    -4
      http-response-bodies.c

+ 31
- 4
http-response-bodies.c View File

@@ -20,13 +20,40 @@
#define DEBUG_PRINTF(...) do {} while (false)
#endif

// Case-insensitive (ASCII only), slower variant of memmem.
//
// Searches the first haystacklen bytes of haystack for the first occurrence of
// the needlelen bytes at needle. Haystack bytes are folded with tolower before
// comparison; the needle is compared as-is, so it must already be in lower-case.
//
// Returns a pointer to the start of the first match inside haystack, or NULL if
// there is no match. As with memmem, an empty needle matches at the start of
// the haystack.
char* memcasemem(char* haystack, size_t haystacklen, char* needle, size_t needlelen) {
	if (needlelen > haystacklen) {
		// A needle longer than the haystack can never be in there.
		return NULL;
	}
	if (needlelen == 0) {
		// Nothing to compare; do not dereference needle at all.
		return haystack;
	}

	// Last start position at which the whole needle still fits into the haystack.
	// The bound is inclusive: a match may end exactly at the end of the haystack.
	char* last = haystack + (haystacklen - needlelen);
	for (char* p1 = haystack; p1 <= last; ++p1) {
		size_t i;
		// p1 + i stays inside the haystack because p1 <= last and i < needlelen.
		for (i = 0; i < needlelen; ++i) {
			// Compare as unsigned char values: tolower returns the unsigned char
			// value as an int, so a plain (possibly signed) char would never
			// compare equal for bytes >= 0x80.
			if (tolower((unsigned char)p1[i]) != (unsigned char)needle[i]) {
				break;
			}
		}
		if (i == needlelen) {
			// Full match
			return p1;
		}
	}
	// Every candidate position was rejected.
	return NULL;
}

int main(int argc, char* argv[]) {
// Read stdin, decode HTTP responses, dump all bodies to stdout.
// stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses).
// One LF is inserted at the end of each response to ensure that a new response always begins on a new line.
// If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake <base> tag: <base href="URL">. The line is terminated with a LF.
// Headers and chunk lines must fit into BUFSIZE.
// Does not fully comply with the HTTP spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
// Does not fully comply with the HTTP spec. For example, continuation lines are unsupported.
char buf[2 * BUFSIZE];
size_t n;
int state = STATE_HEADERS;
@@ -119,7 +146,7 @@ checkstate:
eoh += 4;
DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp);

m0 = memmem(bufp, n, "\r\nContent-Length:", 17);
m0 = memcasemem(bufp, n, "\r\ncontent-length:", 17);
if (m0 && m0 < eoh) {
DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
@@ -151,7 +178,7 @@ checkstate:

state = STATE_BODY;
} else {
m0 = memmem(bufp, n, "\r\nTransfer-Encoding:", 20);
m0 = memcasemem(bufp, n, "\r\ntransfer-encoding:", 20);
if (!m0 || m0 >= eoh) {
fprintf(stderr, "Error: Content-Length and Transfer-Encoding missing\n");
return 1;
@@ -180,7 +207,7 @@ checkstate:
}

if (html_fake_base) {
m0 = memmem(bufp, n, "\r\nContent-Type:", 15);
m0 = memcasemem(bufp, n, "\r\ncontent-type:", 15);
if (m0 && m0 < eoh) {
DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);


Loading…
Cancel
Save