#define _GNU_SOURCE
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#ifndef BUFSIZE
#define BUFSIZE 1048576
#endif

#define STATE_HEADERS 0
#define STATE_BODY 1 // Body with a known length (Content-Length header or metadata line)
#define STATE_CHUNK_LINE 2
#define STATE_CHUNK_CONTENTS 3

#ifdef DEBUG
#define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
#else
#define DEBUG_PRINTF(...) do {} while (false)
#endif

// Case-insensitive (for ASCII) and slower version of memmem.
// The needle must already be in lower-case.
// Returns a pointer to the first match inside the haystack, or NULL if there is none.
// An empty needle matches at the start of the haystack, like memmem.
char* memcasemem(char* haystack, size_t haystacklen, char* needle, size_t needlelen) {
	if (needlelen == 0) {
		return haystack;
	}
	if (needlelen > haystacklen) {
		// A needle longer than the haystack can never be in there.
		return NULL;
	}
	// Last position at which the needle still fits; this position itself must be tested (hence <=).
	char* last = haystack + (haystacklen - needlelen);
	for (char* p1 = haystack; p1 <= last; ++p1) {
		size_t i = 0;
		while (i < needlelen && tolower((unsigned char)p1[i]) == (unsigned char)needle[i]) {
			++i;
		}
		if (i == needlelen) {
			// Full match
			return p1;
		}
	}
	return NULL;
}

// Debug helper: print up to 64 bytes of data to stderr, escaping non-printable bytes. No output unless DEBUG is defined.
static void debug_dump(const char* data, size_t len) {
	for (size_t i = 0; i < len && i < 64; ++i) {
		DEBUG_PRINTF(isprint((unsigned char)data[i]) ? "%c" : "\\x%02x", (unsigned char)data[i]);
	}
}

// Move the n unprocessed bytes at data to the start of buf, append up to BUFSIZE bytes from stdin, and NUL-terminate.
// Requires n < BUFSIZE and buf to hold at least 2 * BUFSIZE + 1 bytes. Returns the new number of valid bytes in buf.
static size_t refill_buffer(char* buf, const char* data, size_t n) {
	DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
	memmove(buf, data, n);
	n += fread(buf + n, 1, BUFSIZE, stdin);
	buf[n] = '\0';
	return n;
}

int main(int argc, char* argv[]) {
	// Read stdin, decode HTTP responses, dump all bodies to stdout.
	// stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses).
	// One LF is inserted at the end of each response to ensure that a new response always begins on a new line.
	// If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake tag: <base href="URL">. The line is terminated with a LF.
	// Headers and chunk lines must fit into BUFSIZE.
	// Does not fully comply with the HTTP spec. For example, continuation lines are unsupported.
	// Returns 0 on success and 1 (after printing a message to stderr) on any parse error.

	// One extra byte so the valid data can always be NUL-terminated; sscanf requires a terminated string.
	static char buf[2 * BUFSIZE + 1];
	size_t n;
	int state = STATE_HEADERS;
	char* bufp;
	char* m0;
	char* m1;
	char* eoh;
	long int nscan;
	size_t bytes_read = 0; // Bytes of the current body or chunk already written to stdout
	size_t length = 0; // Length of the current body or chunk
	bool html_fake_base = false;
	char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below.
	size_t urllen = 0;
	bool have_response_length = false;
	size_t response_length = 0;

	if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) {
		html_fake_base = true;
	}

	while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
		buf[n] = '\0';
		bufp = buf;

checkstate:
		DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
		DEBUG_PRINTF("Beginning of buffer: ");
		debug_dump(bufp, n);
		DEBUG_PRINTF("\n");
		if (n == 0) {
			break;
		}
		DEBUG_PRINTF("State: %d\n", state);

		if (state == STATE_HEADERS) {
			if (n < 9) {
				fprintf(stderr, "Error: too little data before HTTP headers\n");
				return 1;
			}

			// Handle URL + Length line; optional for HTTP/1.1 but required for HTTP/1.0
			// All per-response metadata is reset so that nothing leaks over from the previous response.
			url = NULL;
			urllen = 0;
			response_length = 0;
			have_response_length = false;
			if (memcmp(bufp, "HTTP/1.1 ", 9) != 0 && memcmp(bufp, "HTTP/1.0 ", 9) != 0) {
				DEBUG_PRINTF("No HTTP header, looking for URL line\n");
				m0 = memmem(bufp, n, "\n", 1);
				if (!m0 || m0 == bufp) {
					fprintf(stderr, "Error: expected HTTP headers or URL line, got neither\n");
					return 1;
				}
				m1 = m0;

				// Skip back over length field
				--m0;
				while (m0 > bufp && '0' <= *m0 && *m0 <= '9') {
					--m0;
				}
				if (*m0 != ' ') {
					fprintf(stderr, "Error: URL line has unexpected format\n");
					return 1;
				}

				// Read length
				if (sscanf(m0, " %zu%ln", &response_length, &nscan) <= 0) {
					fprintf(stderr, "Error: URL line contains no length\n");
					return 1;
				}
				if (nscan != m1 - m0) {
					fprintf(stderr, "Error: URL line length read mismatch\n");
					return 1;
				}
				have_response_length = true;
				DEBUG_PRINTF("Response length: %zu\n", response_length);

				// Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace.
				url = bufp;
				urllen = m0 - bufp;
				if (!memmem(url, urllen, "://", 3)) {
					fprintf(stderr, "Error: URL line contains no scheme\n");
					return 1;
				}
				m0 = url;
				while (m0 < bufp + urllen && *m0 != '\r' && *m0 != '\n' && *m0 != ' ' && *m0 != '\t') {
					++m0;
				}
				if (m0 != bufp + urllen) {
					fprintf(stderr, "Error: URL contains CR, LF, or whitespace\n");
					return 1;
				}
				DEBUG_PRINTF("Found URL: ");
				debug_dump(url, urllen);
				if (urllen > 64) {
					DEBUG_PRINTF("<...>");
				}
				DEBUG_PRINTF("\n");

				// Skip over URL line and continue processing below
				DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", (size_t)(m1 + 1 - bufp));
				n = n - (m1 + 1 - bufp);
				bufp = m1 + 1;
			}

			if (html_fake_base && !url) {
				fprintf(stderr, "Error: --html-fake-base requires URL lines\n");
				return 1;
			}

			// Must be checked before looking at the status line again; the URL line may have consumed most of the buffer.
			if (n < 9) {
				fprintf(stderr, "Error: too little data before HTTP headers\n");
				return 1;
			}

			if (memcmp(bufp, "HTTP/1.0 ", 9) == 0 && !have_response_length) {
				fprintf(stderr, "Error: HTTP/1.0 requires URL metadata lines\n");
				return 1;
			}

			if (memcmp(bufp, "HTTP/1.1 ", 9) == 0 || memcmp(bufp, "HTTP/1.0 ", 9) == 0) {
				// Got some headers; find transfer encoding, content length, and end of headers
				// The header block ends at whichever of CRLF CRLF and LF LF comes first.
				m0 = memmem(bufp, n, "\r\n\r\n", 4);
				m1 = memmem(bufp, n, "\n\n", 2);
				if (m0 && m1) {
					eoh = (m0 < m1 ? m0 + 4 : m1 + 2);
				} else if (m0) {
					eoh = m0 + 4;
				} else if (m1) {
					eoh = m1 + 2;
				} else {
					eoh = NULL;
				}
				if (!eoh) {
					fprintf(stderr, "Error: end of headers not found\n");
					return 1;
				}
				DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, (size_t)(eoh - bufp));

				if (memcmp(bufp, "HTTP/1.0 ", 9) == 0) {
					// HTTP 1.0 doesn't have TE, so just use the response_length for the content length and skip the other header parsing
					// Compared as sizes rather than pointers so that a huge length cannot form an out-of-range pointer.
					if (response_length < (size_t)(eoh - bufp)) {
						fprintf(stderr, "Error: end of headers occurs after alleged response length\n");
						return 1;
					}
					length = response_length - (eoh - bufp);
					state = STATE_BODY;
				} else if ((m0 = memcasemem(bufp, eoh - bufp, "\ncontent-length:", 16)) && m0 < eoh) {
					DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 1), (size_t)(m0 + 1 - bufp));
					m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
					if (!m1) {
						fprintf(stderr, "Error: CRLF after Content-Length missing\n");
						return 1;
					}
					if (*(m1 - 1) == '\r') {
						--m1;
					}
					m0 += 16;
					while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) {
						++m0;
					}
					if (sscanf(m0, "%zu%ln", &length, &nscan) <= 0) {
						fprintf(stderr, "Error: invalid Content-Length\n");
						return 1;
					}
					if (nscan < 0) {
						fprintf(stderr, "Error: negative nscan\n");
						return 1;
					}
					if (m0 + nscan > bufp + n) {
						fprintf(stderr, "Error: buffer overread\n");
						return 1;
					}
					m0 += nscan;
					while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) {
						++m0;
					}
					if (m0 != m1) {
						fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
						return 1;
					}
					DEBUG_PRINTF("Content length: %zu\n", length);
					state = STATE_BODY;
				} else {
					m0 = memcasemem(bufp, eoh - bufp, "\ntransfer-encoding:", 19);
					if (!m0 || m0 >= eoh) {
						DEBUG_PRINTF("No Content-Length or Transfer-Encoding, falling back to response length\n");
						if (!have_response_length) {
							fprintf(stderr, "Error: Content-Length and Transfer-Encoding missing and no response length from metadata line\n");
							return 1;
						}
						if (response_length < (size_t)(eoh - bufp)) {
							fprintf(stderr, "Error: end of headers occurs after alleged response length\n");
							return 1;
						}
						length = response_length - (eoh - bufp);
						state = STATE_BODY;
					} else {
						DEBUG_PRINTF("Found Transfer-Encoding header at %p (offset %zu)\n", (void*)(m0 + 1), (size_t)(m0 + 1 - bufp));
						m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
						if (!m1 || m1 >= eoh - 1) {
							fprintf(stderr, "Error: CRLF after Transfer-Encoding missing\n");
							return 1;
						}
						m0 += 19;
						if (*(m1 - 1) == '\r') {
							--m1;
						}
						while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) {
							++m0;
						}
						if (memcmp(m0, "chunked", 7) != 0) {
							fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
							return 1;
						}
						m0 += 7;
						while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) {
							++m0;
						}
						if (m0 != m1) {
							fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
							return 1;
						}
						DEBUG_PRINTF("Chunked transfer encoding\n");
						state = STATE_CHUNK_LINE;
					}
				}

				if (html_fake_base) {
					m0 = memcasemem(bufp, eoh - bufp, "\ncontent-type:", 14);
					if (m0 && m0 < eoh) {
						DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 1), (size_t)(m0 + 1 - bufp));
						m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
						if (!m1) {
							fprintf(stderr, "Error: CRLF after Content-Type missing\n");
							return 1;
						}
						m0 += 14;
						while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) {
							++m0;
						}
						if (memcmp(m0, "text/html", 9) == 0) {
							DEBUG_PRINTF("Is HTML response, inserting fake base tag\n");
							// url is not NUL-terminated, so it is written with an explicit length.
							fprintf(stdout, "<base href=\"");
							fwrite(url, 1, urllen, stdout);
							fprintf(stdout, "\">\n");
						}
					}
				}

				DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", (size_t)(eoh - bufp));
				n = n - (eoh - bufp);
				bufp = eoh;
				bytes_read = 0;
				goto checkstate;
			} else {
				fprintf(stderr, "Error: expected header line, got something else\n");
				return 1;
			}
		} else if (state == STATE_BODY || state == STATE_CHUNK_CONTENTS) {
			// A body is followed by one LF, a chunk by CRLF; the terminator must be in the buffer before the body/chunk can be finished.
			size_t remaining = length - bytes_read;
			size_t terminator = (state == STATE_BODY ? 1 : 2);
			if (n < terminator || remaining > n - terminator) {
				// Only got part of the body (or the body without its complete terminator)
				// Never write more than the body itself, even if part of the terminator is already in the buffer.
				size_t towrite = (remaining < n ? remaining : n);
				DEBUG_PRINTF("Partial body\n");
				DEBUG_PRINTF("Copying %zu bytes to stdout\n", towrite);
				fwrite(bufp, 1, towrite, stdout);
				bytes_read += towrite;
				DEBUG_PRINTF("%zu of %zu bytes from this response written\n", bytes_read, length);
				if (towrite < n) {
					// The buffer ends in the middle of the terminator; keep those bytes and fetch the rest.
					size_t leftover = n - towrite;
					n = refill_buffer(buf, bufp + towrite, leftover);
					bufp = buf;
					if (n == leftover) {
						fprintf(stderr, "Error: incomplete body at the end of input\n");
						return 1;
					}
					goto checkstate;
				}
				// Buffer fully consumed; the loop condition reads the next block.
			} else {
				// Remainder of the response is in the buffer.
				size_t skip = remaining;
				DEBUG_PRINTF("Full body\n");
				DEBUG_PRINTF("Copying %zu bytes to stdout\n", remaining);
				fwrite(bufp, 1, remaining, stdout);
				if (state == STATE_BODY) {
					fprintf(stdout, "\n");
				}
				if (state == STATE_CHUNK_CONTENTS && bufp[skip] == '\r') {
					// Chunks are terminated by CRLF; step over the CR so that only the LF remains to be checked.
					++skip;
				}
				if (bufp[skip] != '\n') {
					fprintf(stderr, "Error: end of HTTP body not found\n");
					return 1;
				}
				DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", skip + 1);
				n -= skip + 1;
				bufp += skip + 1;
				if (n < BUFSIZE) {
					n = refill_buffer(buf, bufp, n);
					bufp = buf;
				}
				if (state == STATE_BODY) {
					state = STATE_HEADERS;
				} else {
					state = STATE_CHUNK_LINE;
				}
				goto checkstate;
			}
		} else if (state == STATE_CHUNK_LINE) {
			m1 = memmem(bufp, n, "\r\n", 2);
			if (!m1) {
				fprintf(stderr, "Error: chunk line EOL missing\n");
				return 1;
			}
			m0 = bufp;
			while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) {
				++m0;
			}
			if (sscanf(m0, "%zx%ln", &length, &nscan) <= 0) {
				fprintf(stderr, "Error: invalid chunk length\n");
				return 1;
			}
			if (nscan < 0) {
				fprintf(stderr, "Error: negative nscan\n");
				return 1;
			}
			if (m0 + nscan > bufp + n) {
				fprintf(stderr, "Error: buffer overread\n");
				return 1;
			}
			m0 += nscan;
			while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) {
				++m0;
			}
			// Only a chunk extension (introduced by ';') may follow the length.
			if (*m0 != ';' && m0 != m1) {
				fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
				return 1;
			}
			DEBUG_PRINTF("Chunk length: %zu bytes\n", length);
			DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", (size_t)(m1 + 2 - bufp));
			n = n - (m1 + 2 - bufp);
			bufp = m1 + 2;
			if (length == 0) {
				// End of response, must be followed by CRLF + LF
				if (n < 3) {
					fprintf(stderr, "Error: buffer exhausted while looking for empty chunk CRLF\n");
					return 1;
				}
				if (bufp[0] != '\r' || bufp[1] != '\n' || bufp[2] != '\n') {
					fprintf(stderr, "Error: end of HTTP body not found\n");
					return 1;
				}
				fprintf(stdout, "\n");
				n -= 3;
				bufp += 3;
				state = STATE_HEADERS;
			} else {
				// Progress is counted per chunk; a stale count from a previous chunk would corrupt the length arithmetic.
				bytes_read = 0;
				state = STATE_CHUNK_CONTENTS;
			}
			if (n < BUFSIZE) {
				n = refill_buffer(buf, bufp, n);
				bufp = buf;
			}
			goto checkstate;
		}
	}

	if (state != STATE_HEADERS) {
		fprintf(stderr, "Error: incomplete body at the end of input\n");
		return 1;
	}
	return 0;
}