|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386 |
- #define _GNU_SOURCE
- #include <ctype.h>
- #include <stdbool.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
-
// Number of bytes requested from stdin per read. main() allocates a work buffer of twice this size.
// May be overridden at compile time (e.g. -DBUFSIZE=...).
#if !defined(BUFSIZE)
#define BUFSIZE (1024 * 1024)
#endif

// Parser states used by main()
#define STATE_HEADERS 0        // Expecting an optional 'URL LENGTH' line followed by the HTTP response headers
#define STATE_BODY 1           // Inside a body whose length is known up front
#define STATE_CHUNK_LINE 2     // Expecting the size line of a chunk (chunked transfer encoding)
#define STATE_CHUNK_CONTENTS 3 // Inside the data of a chunk

// DEBUG_PRINTF(...) prints printf-style diagnostics to stderr when compiled with -DDEBUG and expands to an empty statement otherwise.
#if defined(DEBUG)
#define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
#else
#define DEBUG_PRINTF(...) do {} while (false)
#endif
-
char* memcasemem(char* haystack, size_t haystacklen, char* needle, size_t needlelen) {
    // Case-insensitive (for ASCII) slower version of memmem.
    // Searches the first haystacklen bytes of haystack for the needlelen bytes of needle.
    // Haystack bytes are lower-cased with tolower() before comparison; needle must already be in lower-case.
    // Returns a pointer to the first match inside haystack, or NULL if there is none.
    // Like memmem, an empty needle matches at the start of the haystack.
    if (needlelen == 0) {
        return haystack;
    }
    if (needlelen > haystacklen) {
        // A needle longer than the haystack can never be in there.
        return NULL;
    }
    // Last offset at which the needle still fits; this offset itself must be tested as well.
    size_t last_start = haystacklen - needlelen;
    for (size_t start = 0; start <= last_start; ++start) {
        size_t i = 0;
        // Compare as unsigned char on both sides so bytes >= 0x80 in the needle compare correctly against tolower()'s result.
        while (i < needlelen && tolower((unsigned char)haystack[start + i]) == (unsigned char)needle[i]) {
            ++i;
        }
        if (i == needlelen) {
            // Full match
            return haystack + start;
        }
    }
    return NULL;
}
-
- int main(int argc, char* argv[]) {
- // Read stdin, decode HTTP responses, dump all bodies to stdout.
- // stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses).
- // One LF is inserted at the end of each response to ensure that a new response always begins on a new line.
- // If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake <base> tag: <base href="URL">. The line is terminated with a LF.
- // Headers and chunk lines must fit into BUFSIZE.
- // Does not fully comply with the HTTP spec. For example, continuation lines are unsupported.
- char buf[2 * BUFSIZE];
- size_t n;
- int state = STATE_HEADERS;
- char* bufp;
- char* m0;
- char* m1;
- char* eoh;
- long int nscan;
- size_t bytes_read = 0;
- size_t length;
- bool html_fake_base = false;
- char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below.
- size_t urllen;
- bool have_response_length = false;
- size_t response_length;
-
- if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) {
- html_fake_base = true;
- }
-
- while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
- bufp = buf;
- checkstate:
- DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
- DEBUG_PRINTF("Beginning of buffer: ");
- for (unsigned int i = 0; i < (n > 64 ? 64 : n); ++i) DEBUG_PRINTF(isprint(*(bufp + i)) ? "%c" : "\\x%02x", *(bufp + i) & 0xFF);
- DEBUG_PRINTF("\n");
- if (n == 0) {
- break;
- }
- DEBUG_PRINTF("State: %d\n", state);
- if (state == STATE_HEADERS) {
- if (n < 9) {
- fprintf(stderr, "Error: too little data before HTTP headers\n");
- return 1;
- }
- // Handle URL + Length line; optional for HTTP/1.1 but required for HTTP/1.0
- url = NULL;
- urllen = 0;
- response_length = 0;
- if (memcmp(bufp, "HTTP/1.1 ", 9) != 0 && memcmp(bufp, "HTTP/1.0 ", 9) != 0) {
- DEBUG_PRINTF("No HTTP header, looking for URL line\n");
- m0 = memmem(bufp, n, "\n", 1);
- if (!m0 || m0 == bufp) {
- fprintf(stderr, "Error: expected HTTP headers or URL line, got neither\n");
- return 1;
- }
- m1 = m0;
- // Skip back over length field
- --m0;
- while (m0 > bufp && '0' <= *m0 && *m0 <= '9') --m0;
- if (*m0 != ' ') {
- fprintf(stderr, "Error: URL line has unexpected format\n");
- return 1;
- }
- // Read length
- if (sscanf(m0, " %zu%ln", &response_length, &nscan) <= 0) {
- fprintf(stderr, "Error: URL line contains no length\n");
- return 1;
- }
- if (nscan != m1 - m0) {
- fprintf(stderr, "Error: URL line length read mismatch\n");
- return 1;
- }
- have_response_length = true;
- DEBUG_PRINTF("Response length: %zu\n", response_length);
- // Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace.
- url = bufp;
- urllen = m0 - bufp;
- if (!memmem(url, urllen, "://", 3)) {
- fprintf(stderr, "Error: URL line contains no scheme\n");
- return 1;
- }
- m0 = url;
- while (m0 < bufp + urllen && *m0 != '\r' && *m0 != '\n' && *m0 != ' ' && *m0 != '\t') ++m0;
- if (m0 != bufp + urllen) {
- fprintf(stderr, "Error: URL contains CR, LF, or whitespace\n");
- return 1;
- }
- DEBUG_PRINTF("Found URL: ");
- for (unsigned int i = 0; i < (urllen > 64 ? 64 : urllen); ++i) DEBUG_PRINTF(isprint(*(url + i)) ? "%c" : "\\x%02x", *(url + i) & 0xFF);
- if (urllen > 64) DEBUG_PRINTF("<...>");
- DEBUG_PRINTF("\n");
- // Skip over URL line and continue processing below
- DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 1 - bufp);
- n = n - (m1 + 1 - bufp);
- bufp = m1 + 1;
- }
- if (html_fake_base && !url) {
- fprintf(stderr, "Error: --html-fake-base requires URL lines\n");
- return 1;
- }
- if (memcmp(bufp, "HTTP/1.0 ", 9) == 0 && !have_response_length) {
- fprintf(stderr, "Error: HTTP/1.0 requires URL metadata lines\n");
- return 1;
- }
-
- if (n < 9) {
- fprintf(stderr, "Error: too little data before HTTP headers\n");
- return 1;
- }
- if (memcmp(bufp, "HTTP/1.1 ", 9) == 0 || memcmp(bufp, "HTTP/1.0 ", 9) == 0) {
- // Got some headers; find transfer encoding, content length, and end of headers
- m0 = memmem(bufp, n, "\r\n\r\n", 4);
- m1 = memmem(bufp, n, "\n\n", 2);
- if (m0 && m1) {
- eoh = (m0 < m1 ? m0 + 4 : m1 + 2);
- } else if (m0) {
- eoh = m0 + 4;
- } else if (m1) {
- eoh = m1 + 2;
- } else {
- eoh = NULL;
- }
- if (!eoh) {
- fprintf(stderr, "Error: end of headers not found\n");
- return 1;
- }
- DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp);
-
- if (memcmp(bufp, "HTTP/1.0 ", 9) == 0) {
- // HTTP 1.0 doesn't have TE, so just use the response_length for the content length and skip the other header parsing
- if (bufp + response_length < eoh) {
- fprintf(stderr, "Error: end of headers occurs after alleged response length\n");
- return 1;
- }
- length = response_length - (eoh - bufp);
- state = STATE_BODY;
- } else if ((m0 = memcasemem(bufp, eoh - bufp, "\ncontent-length:", 16)) && m0 < eoh) {
- DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
- m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
- if (!m1) {
- fprintf(stderr, "Error: CRLF after Content-Length missing\n");
- return 1;
- }
- if (*(m1 - 1) == '\r') --m1;
- m0 += 16;
- while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (sscanf(m0, "%zu%ln", &length, &nscan) <= 0) {
- fprintf(stderr, "Error: invalid Content-Length\n");
- return 1;
- }
- if (nscan < 0) {
- fprintf(stderr, "Error: negative nscan\n");
- return 1;
- }
- if (m0 + nscan > bufp + n) {
- fprintf(stderr, "Error: buffer overread\n");
- return 1;
- }
- m0 += nscan;
- while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (m0 != m1) {
- fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
- return 1;
- }
- DEBUG_PRINTF("Content length: %zu\n", length);
-
- state = STATE_BODY;
- } else {
- m0 = memcasemem(bufp, eoh - bufp, "\ntransfer-encoding:", 19);
- if (!m0 || m0 >= eoh) {
- DEBUG_PRINTF("No Content-Length or Transfer-Encoding, falling back to response length\n");
- if (!have_response_length) {
- fprintf(stderr, "Error: Content-Length and Transfer-Encoding missing and no response length from metadata line\n");
- return 1;
- }
- if (bufp + response_length < eoh) {
- fprintf(stderr, "Error: end of headers occurs after alleged response length\n");
- return 1;
- }
- length = response_length - (eoh - bufp);
- state = STATE_BODY;
- } else {
- DEBUG_PRINTF("Found Transfer-Encoding header at %p (offset %zu)\n", (void*)(m0 + 1), m0 + 1 - bufp);
- m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
- if (!m1 || m1 >= eoh - 1) {
- fprintf(stderr, "Error: CRLF after Transfer-Encoding missing\n");
- return 1;
- }
- m0 += 19;
- if (*(m1 - 1) == '\r') --m1;
- while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (memcmp(m0, "chunked", 7) != 0) {
- fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
- return 1;
- }
- m0 += 7;
- while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (m0 != m1) {
- fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
- return 1;
- }
- DEBUG_PRINTF("Chunked transfer encoding\n");
-
- state = STATE_CHUNK_LINE;
- }
- }
-
- if (html_fake_base) {
- m0 = memcasemem(bufp, eoh - bufp, "\ncontent-type:", 14);
- if (m0 && m0 < eoh) {
- DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 1), m0 + 1 - bufp);
- m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
- if (!m1) {
- fprintf(stderr, "Error: CRLF after Content-Type missing\n");
- return 1;
- }
- m0 += 14;
- while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (memcmp(m0, "text/html", 9) == 0) {
- DEBUG_PRINTF("Is HTML response, inserting fake base tag\n");
- fprintf(stdout, "<base href=\"");
- fwrite(url, 1, urllen, stdout);
- fprintf(stdout, "\">\n");
- }
- }
- }
-
- DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", eoh - bufp);
- n = n - (eoh - bufp);
- bufp = eoh;
- bytes_read = 0;
- goto checkstate;
- } else {
- fprintf(stderr, "Error: expected header line, got something else\n");
- return 1;
- }
- } else if (state == STATE_BODY || state == STATE_CHUNK_CONTENTS) {
- if (length + (state == STATE_BODY ? 1 : 2) - bytes_read > n) {
- // Only got part of the body
- DEBUG_PRINTF("Partial body\n");
- DEBUG_PRINTF("Copying %zu bytes to stdout\n", n);
- fwrite(bufp, 1, n, stdout);
- bytes_read += n;
- DEBUG_PRINTF("%zu of %zu bytes from this response written\n", bytes_read, length);
- } else {
- // Remainder of the response is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
- DEBUG_PRINTF("Full body\n");
- DEBUG_PRINTF("Copying %zu bytes to stdout\n", length - bytes_read);
- fwrite(bufp, 1, length - bytes_read, stdout);
- fprintf(stdout, "\n");
- if (state == STATE_CHUNK_CONTENTS && *(bufp + length - bytes_read) == '\r') {
- // Stupid hack to enforce the CRLF
- ++length;
- }
- if (memcmp(bufp + length - bytes_read, "\n", 1) != 0) {
- fprintf(stderr, "Error: end of HTTP body not found\n");
- return 1;
- }
- DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", length + 1 - bytes_read);
- n = n - (length + 1 - bytes_read);
- bufp = bufp + length + 1 - bytes_read;
- if (n < BUFSIZE) {
- DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
- memmove(buf, bufp, n);
- bufp = buf;
- n += fread(buf + n, 1, BUFSIZE, stdin);
- }
- if (state == STATE_BODY) {
- state = STATE_HEADERS;
- } else {
- state = STATE_CHUNK_LINE;
- }
- goto checkstate;
- }
- } else if (state == STATE_CHUNK_LINE) {
- m1 = memmem(bufp, n, "\r\n", 2);
- if (!m1) {
- fprintf(stderr, "Error: chunk line EOL missing\n");
- return 1;
- }
- m0 = bufp;
- while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (sscanf(m0, "%zx%ln", &length, &nscan) <= 0) {
- fprintf(stderr, "Error: invalid chunk length\n");
- return 1;
- }
- if (nscan < 0) {
- fprintf(stderr, "Error: negative nscan\n");
- return 1;
- }
- if (m0 + nscan > bufp + n) {
- fprintf(stderr, "Error: buffer overread\n");
- return 1;
- }
- m0 += nscan;
- while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (*m0 != ';' && m0 != m1) {
- fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
- return 1;
- }
- DEBUG_PRINTF("Chunk length: %zu bytes\n", length);
-
- DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 2 - bufp);
- n = n - (m1 + 2 - bufp);
- bufp = m1 + 2;
-
- if (length == 0) {
- // End of response, must be followed by CRLF + LF
- if (n < 3) {
- fprintf(stderr, "Error: buffer exhausted while looking for empty chunk CRLF\n");
- return 1;
- }
- if (*(m1 + 2) != '\r' || *(m1 + 3) != '\n' || *(m1 + 4) != '\n') {
- fprintf(stderr, "Error: end of HTTP body not found\n");
- return 1;
- }
- n -= 3;
- bufp += 3;
- state = STATE_HEADERS;
- } else {
- state = STATE_CHUNK_CONTENTS;
- }
-
- if (n < BUFSIZE) {
- DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
- memmove(buf, bufp, n);
- bufp = buf;
- n += fread(buf + n, 1, BUFSIZE, stdin);
- }
- goto checkstate;
- }
- }
- if (state != STATE_HEADERS) {
- fprintf(stderr, "Error: incomplete body at the end of input\n");
- return 1;
- }
- }
|