|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194 |
- #define _GNU_SOURCE
- #include <ctype.h>
- #include <stdbool.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
-
// Number of bytes requested from stdin per fread() call. main() allocates a
// buffer of twice this size so that leftover data can be kept in front of a
// refill. May be overridden at compile time, e.g. with -DBUFSIZE=65536.
#ifndef BUFSIZE
#define BUFSIZE 1048576
#endif

// Parser states used by main():
//   STATE_BEFORE_RECORD   - the buffer is positioned at the start of a record's headers
//   STATE_RESPONSE_RECORD - inside the body of a "response" record (body is copied to stdout)
//   STATE_OTHER_RECORD    - inside the body of any other record (body is skipped)
enum {
    STATE_BEFORE_RECORD = 0,
    STATE_RESPONSE_RECORD = 1,
    STATE_OTHER_RECORD = 2
};

// printf-style tracing to stderr. Expands to an empty statement (the arguments
// are not evaluated) unless the program is compiled with -DDEBUG.
#ifdef DEBUG
#define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#else
#define DEBUG_PRINTF(...) do {} while (0)
#endif
-
- int main(int argc, char* argv[]) {
- // Read stdin, decode WARC, dump all response record bodies to stdout.
- // One LF is inserted at the end of each response to ensure that a new record always begins on a new line.
- // If the --meta option is given, one line is printed before each record consisting of the WARC-Target-URI, a space, the record length in bytes in decimal notation, and a LF.
- // Headers must fit into BUFSIZE.
- // Does not fully comply with the WARC spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
- char buf[2 * BUFSIZE];
- size_t n;
- int state = STATE_BEFORE_RECORD;
- char* bufp;
- char* m0;
- char* m1;
- size_t record_bytes_read;
- size_t record_length;
- size_t nscan;
- bool meta = false;
-
- if (argc == 2 && strcmp(argv[1], "--meta") == 0) {
- meta = true;
- }
-
- while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
- bufp = buf;
- checkstate:
- DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
- DEBUG_PRINTF("Beginning of buffer: ");
- for (int i = 0; i < 64; ++i) DEBUG_PRINTF(isprint(*(bufp + i)) ? "%c" : "\\x%02x", *(bufp + i) & 0xFF);
- DEBUG_PRINTF("\n");
- if (n == 0) {
- break;
- }
- DEBUG_PRINTF("State: %d\n", state);
- if (state == STATE_BEFORE_RECORD) {
- if (n < 10) {
- fprintf(stderr, "Error: too little data before WARC headers\n");
- return 1;
- }
- if (memcmp(bufp, "WARC/1.0\r\n", 10) == 0 || memcmp(bufp, "WARC/1.1\r\n", 10) == 0) {
- // Got some headers; find the record type, content length, and end of headers
- m0 = memmem(bufp, n, "\r\nContent-Length:", 17);
- if (!m0) {
- fprintf(stderr, "Error: Content-Length missing\n");
- return 1;
- }
- DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
- m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
- if (!m1) {
- fprintf(stderr, "Error: CRLF after Content-Length missing\n");
- return 1;
- }
- m0 += 17;
- while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (!sscanf(m0, "%zu%n", &record_length, &nscan)) {
- fprintf(stderr, "Error: invalid Content-Length\n");
- return 1;
- }
- if (nscan > n - (m0 - bufp)) {
- fprintf(stderr, "Error: buffer overread\n");
- return 1;
- }
- m0 += nscan;
- while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (m0 != m1) {
- fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
- return 1;
- }
- DEBUG_PRINTF("Record body length: %zu\n", record_length);
-
- m0 = memmem(bufp, n, "\r\nWARC-Type:", 12);
- if (!m0) {
- fprintf(stderr, "Error: WARC-Type missing\n");
- return 1;
- }
- DEBUG_PRINTF("Found WARC-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
- m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
- if (!m1) {
- fprintf(stderr, "Error: CRLF after WARC-Type missing\n");
- return 1;
- }
- m0 += 12;
- while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
- if (memcmp(m0, "response", 8) == 0) {
- DEBUG_PRINTF("Response record\n");
- state = STATE_RESPONSE_RECORD;
- } else {
- DEBUG_PRINTF("Other record\n");
- state = STATE_OTHER_RECORD;
- }
-
- if (meta && state == STATE_RESPONSE_RECORD) {
- m0 = memmem(bufp, n, "\r\nWARC-Target-URI:", 18);
- if (!m0) {
- fprintf(stderr, "Error: WARC-Target-URI missing\n");
- return 1;
- }
- DEBUG_PRINTF("Found WARC-Target-URI header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
- m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
- if (!m1) {
- fprintf(stderr, "Error: CRLF after WARC-Target-URI missing\n");
- return 1;
- }
- m0 += 18;
- while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0;
- DEBUG_PRINTF("WARC-Target-URI value starts at %p (offset %zu)\n", (void*)m0, m0 - bufp);
- --m1;
- while (m1 > m0 && (*m1 == ' ' || *m1 == '\t')) --m1;
- DEBUG_PRINTF("WARC-Target-URI value ends at %p (offset %zu)\n", (void*)(m1 + 1), m1 + 1 - bufp);
- if (m1 <= m0) {
- fprintf(stderr, "Error: empty WARC-Target-URI\n");
- return 1;
- }
- fwrite(m0, 1, m1 + 1 - m0, stdout);
- fprintf(stdout, " %zu\n", record_length);
- }
-
- m0 = memmem(bufp, n, "\r\n\r\n", 4);
- if (!m0) {
- fprintf(stderr, "Error: end of headers not found\n");
- return 1;
- }
- m0 += 4;
- DEBUG_PRINTF("Record body begins at %p (offset %zu)\n", (void*)m0, m0 - bufp);
-
- DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m0 - bufp);
- n = n - (m0 - bufp);
- bufp = m0;
- record_bytes_read = 0;
- goto checkstate;
- } else {
- fprintf(stderr, "Error: expected header line, got something else\n");
- return 1;
- }
- } else if (state == STATE_RESPONSE_RECORD || state == STATE_OTHER_RECORD) {
- if (record_length + 4 - record_bytes_read > n) {
- // Only got part of the record body
- DEBUG_PRINTF("Partial record\n");
- if (state == STATE_RESPONSE_RECORD) {
- DEBUG_PRINTF("Copying %zu bytes to stdout\n", n);
- fwrite(bufp, 1, n, stdout);
- }
- record_bytes_read += n;
- DEBUG_PRINTF("%zu of %zu bytes from this record written\n", record_bytes_read, record_length);
- } else {
- // Remainder of the record is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
- DEBUG_PRINTF("Full record\n");
- if (state == STATE_RESPONSE_RECORD) {
- DEBUG_PRINTF("Copying %zu bytes to stdout\n", record_length - record_bytes_read);
- fwrite(bufp, 1, record_length - record_bytes_read, stdout);
- fprintf(stdout, "\n");
- }
- if (memcmp(bufp + record_length - record_bytes_read, "\r\n\r\n", 4) != 0) {
- fprintf(stderr, "Error: end of block not found\n");
- return 1;
- }
- DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", record_length + 4 - record_bytes_read);
- n = n - (record_length + 4 - record_bytes_read);
- bufp = bufp + record_length + 4 - record_bytes_read;
- if (n < BUFSIZE) {
- DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
- memmove(buf, bufp, n);
- bufp = buf;
- n += fread(buf + n, 1, BUFSIZE, stdin);
- }
- state = STATE_BEFORE_RECORD;
- goto checkstate;
- }
- }
- }
- if (state != STATE_BEFORE_RECORD) {
- fprintf(stderr, "Error: incomplete record at the end of input\n");
- return 1;
- }
- }
|