#define _GNU_SOURCE /* for memmem() */

#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#ifndef BUFSIZE
#define BUFSIZE 1048576
#endif

#define STATE_BEFORE_RECORD 0
#define STATE_RESPONSE_RECORD 1
#define STATE_OTHER_RECORD 2

#ifdef DEBUG
#define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
#else
#define DEBUG_PRINTF(...) do {} while (false)
#endif

/* Length of the CRLFCRLF sequence that terminates both the header block and each record. */
#define TERMINATOR_LEN 4

/* Advance p past spaces and horizontal tabs without going beyond end. */
static const char* skip_blanks(const char* p, const char* end)
{
	while (p < end && (*p == ' ' || *p == '\t')) {
		++p;
	}
	return p;
}

/*
 * Find the value of a header inside the header block [hdr, hdr_end).
 *
 * name is a NUL-terminated pattern that includes the leading CRLF and the
 * trailing colon (e.g. "\r\nWARC-Type:"); the match is case-sensitive.
 * On success, *value and *value_end delimit the value with spaces and tabs
 * trimmed from both sides (the range may be empty) and true is returned.
 * Returns false if the header is not present in the block.
 */
static bool find_header_value(const char* hdr, const char* hdr_end, const char* name,
                              const char** value, const char** value_end)
{
	const size_t name_len = strlen(name);
	const char* match = memmem(hdr, (size_t)(hdr_end - hdr), name, name_len);
	if (!match) {
		return false;
	}
	DEBUG_PRINTF("Found %s header at offset %zu\n", name + 2, (size_t)(match + 2 - hdr));

	const char* start = match + name_len;
	const char* eol = memmem(start, (size_t)(hdr_end - start), "\r\n", 2);
	if (!eol) {
		/* Cannot happen when hdr_end directly follows CRLFCRLF; kept as a guard. */
		return false;
	}

	start = skip_blanks(start, eol);
	while (eol > start && (eol[-1] == ' ' || eol[-1] == '\t')) {
		--eol;
	}
	*value = start;
	*value_end = eol;
	return true;
}

/*
 * Parse a run of ASCII decimal digits starting at p, never reading at or past end.
 *
 * Stores the number in *out and returns a pointer to the first byte after the
 * digits. Returns NULL if there is no digit at p or the number does not fit
 * into a size_t. Signs and leading whitespace are not accepted.
 */
static const char* parse_decimal(const char* p, const char* end, size_t* out)
{
	const char* const first = p;
	size_t value = 0;

	for (; p < end && *p >= '0' && *p <= '9'; ++p) {
		const size_t digit = (size_t)(*p - '0');
		if (value > (SIZE_MAX - digit) / 10) {
			return NULL; /* would overflow */
		}
		value = value * 10 + digit;
	}
	if (p == first) {
		return NULL;
	}
	*out = value;
	return p;
}

/*
 * Move the n unread bytes at src to the start of buf and append up to BUFSIZE
 * further bytes from stdin. The caller must ensure n < BUFSIZE so that the
 * result fits into the 2 * BUFSIZE buffer. Returns the new number of valid
 * bytes in buf; a return value equal to n means stdin delivered nothing more.
 */
static size_t compact_and_refill(char* buf, const char* src, size_t n)
{
	DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
	memmove(buf, src, n);
	return n + fread(buf + n, 1, BUFSIZE, stdin);
}

/*
 * Read stdin, decode WARC, dump all response record bodies to stdout.
 *
 * One LF is inserted at the end of each response to ensure that a new record
 * always begins on a new line.
 * If the --meta option is given (as the only argument), one line is printed
 * before each response record consisting of the WARC-Target-URI, a space, the
 * record length in bytes in decimal notation, and a LF.
 *
 * Limitations: headers must fit into BUFSIZE. Does not fully comply with the
 * WARC spec; for example, headers must be capitalised canonically, and
 * continuation lines are unsupported.
 *
 * Returns 0 on success and 1 after printing a message to stderr on malformed
 * or truncated input or on an I/O error.
 */
int main(int argc, char* argv[])
{
	/* Static rather than automatic: 2 * BUFSIZE is too large to put on the stack safely. */
	static char buf[2 * BUFSIZE];
	int state = STATE_BEFORE_RECORD;
	size_t record_length = 0;
	size_t record_bytes_read = 0;
	size_t n;
	const bool meta = (argc == 2 && strcmp(argv[1], "--meta") == 0);

	while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
		char* bufp = buf;

		/* Consume the buffered data; when it runs out, the outer loop reads a fresh block. */
		while (n > 0) {
			DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
#ifdef DEBUG
			DEBUG_PRINTF("Beginning of buffer: ");
			for (size_t i = 0; i < 64 && i < n; ++i) {
				const unsigned char c = (unsigned char)bufp[i];
				DEBUG_PRINTF(isprint(c) ? "%c" : "\\x%02x", c);
			}
			DEBUG_PRINTF("\n");
#endif
			DEBUG_PRINTF("State: %d\n", state);

			if (state == STATE_BEFORE_RECORD) {
				if (n < 10) {
					fprintf(stderr, "Error: too little data before WARC headers\n");
					return 1;
				}
				if (memcmp(bufp, "WARC/1.0\r\n", 10) != 0 && memcmp(bufp, "WARC/1.1\r\n", 10) != 0) {
					fprintf(stderr, "Error: expected header line, got something else\n");
					return 1;
				}

				/* Got some headers; find the end of headers, content length, and record type. */
				char* eoh = memmem(bufp, n, "\r\n\r\n", TERMINATOR_LEN);
				if (!eoh) {
					fprintf(stderr, "Error: end of headers not found\n");
					return 1;
				}
				eoh += TERMINATOR_LEN;
				DEBUG_PRINTF("Record body begins at %p (offset %zu)\n", (void*)eoh, (size_t)(eoh - bufp));

				const char* value;
				const char* value_end;

				if (!find_header_value(bufp, eoh, "\r\nContent-Length:", &value, &value_end)) {
					fprintf(stderr, "Error: Content-Length missing\n");
					return 1;
				}
				const char* after_digits = parse_decimal(value, value_end, &record_length);
				if (!after_digits) {
					fprintf(stderr, "Error: invalid Content-Length\n");
					return 1;
				}
				if (after_digits != value_end) {
					fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
					return 1;
				}
				DEBUG_PRINTF("Record body length: %zu\n", record_length);

				if (!find_header_value(bufp, eoh, "\r\nWARC-Type:", &value, &value_end)) {
					fprintf(stderr, "Error: WARC-Type missing\n");
					return 1;
				}
				if (value_end - value == 8 && memcmp(value, "response", 8) == 0) {
					DEBUG_PRINTF("Response record\n");
					state = STATE_RESPONSE_RECORD;
				} else {
					DEBUG_PRINTF("Other record\n");
					state = STATE_OTHER_RECORD;
				}

				if (meta && state == STATE_RESPONSE_RECORD) {
					if (!find_header_value(bufp, eoh, "\r\nWARC-Target-URI:", &value, &value_end)) {
						fprintf(stderr, "Error: WARC-Target-URI missing\n");
						return 1;
					}
					if (value == value_end) {
						fprintf(stderr, "Error: empty WARC-Target-URI\n");
						return 1;
					}
					fwrite(value, 1, (size_t)(value_end - value), stdout);
					fprintf(stdout, " %zu\n", record_length);
				}

				DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", (size_t)(eoh - bufp));
				n -= (size_t)(eoh - bufp);
				bufp = eoh;
				record_bytes_read = 0;
				continue;
			}

			/* STATE_RESPONSE_RECORD or STATE_OTHER_RECORD: inside a record body. */
			const size_t remaining = record_length - record_bytes_read;

			/* Written without "remaining + 4" so that a huge Content-Length cannot wrap around. */
			if (n < remaining || n - remaining < TERMINATOR_LEN) {
				/* Only got part of the record body and/or of the terminating CRLFCRLF. */
				DEBUG_PRINTF("Partial record\n");
				const size_t tocopy = remaining < n ? remaining : n;
				if (state == STATE_RESPONSE_RECORD) {
					DEBUG_PRINTF("Copying %zu bytes to stdout\n", tocopy);
					fwrite(bufp, 1, tocopy, stdout);
				}
				record_bytes_read += tocopy;
				DEBUG_PRINTF("%zu of %zu bytes from this record written\n", record_bytes_read, record_length);
				bufp += tocopy;
				n -= tocopy;

				if (n > 0) {
					/* Between one and three bytes of the terminator are buffered; fetch the rest. */
					DEBUG_PRINTF("Truncated end of block\n");
					const size_t before = n;
					n = compact_and_refill(buf, bufp, n);
					bufp = buf;
					if (n == before) {
						/* No more input: bail out instead of retrying forever. */
						fprintf(stderr, "Error: incomplete record at the end of input\n");
						return 1;
					}
				}
				continue;
			}

			/* Remainder of the record, including its terminator, is in the buffer. */
			DEBUG_PRINTF("Full record\n");
			if (state == STATE_RESPONSE_RECORD) {
				DEBUG_PRINTF("Copying %zu bytes to stdout\n", remaining);
				fwrite(bufp, 1, remaining, stdout);
				fprintf(stdout, "\n");
			}
			if (memcmp(bufp + remaining, "\r\n\r\n", TERMINATOR_LEN) != 0) {
				fprintf(stderr, "Error: end of block not found\n");
				return 1;
			}
			DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", remaining + TERMINATOR_LEN);
			n -= remaining + TERMINATOR_LEN;
			bufp += remaining + TERMINATOR_LEN;
			if (n < BUFSIZE) {
				/* Top up so that the next record's headers are fully buffered. */
				n = compact_and_refill(buf, bufp, n);
				bufp = buf;
			}
			state = STATE_BEFORE_RECORD;
		}
	}

	if (ferror(stdin)) {
		fprintf(stderr, "Error: failed to read input\n");
		return 1;
	}
	if (state != STATE_BEFORE_RECORD) {
		fprintf(stderr, "Error: incomplete record at the end of input\n");
		return 1;
	}
	if (fflush(stdout) != 0 || ferror(stdout)) {
		fprintf(stderr, "Error: failed to write output\n");
		return 1;
	}
	return 0;
}