/*
 * warc-dump-responses.c
 *
 * Read a WARC stream from stdin and write the block (body) of every record
 * whose WARC-Type starts with "response" to stdout.  One LF is written after
 * each dumped block so that the next block always begins on a new line.
 *
 * Origin: commit acd2fab899d952edaa34df7c736e0c9935d73431
 * ("Add warc-dump-responses"), JustAnotherArchivist, 2023-01-01.  The same
 * commit adds a `warc-dump-responses` symlink pointing at `.make-and-exec`.
 *
 * Build-time knobs:
 *   BUFSIZE  read chunk size in bytes (default 1 MiB).  The complete header
 *            block of a record must fit into BUFSIZE.
 *   DEBUG    when defined, trace the state machine on stderr.
 *
 * Exit status: 0 on success, 1 on malformed/truncated input or write failure.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#ifndef BUFSIZE
#define BUFSIZE 1048576
#endif

#define STATE_BEFORE_RECORD 0
#define STATE_RESPONSE_RECORD 1
#define STATE_OTHER_RECORD 2
#define STATE_RECORD_TRAILER 3

#ifdef DEBUG
#define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
#else
#define DEBUG_PRINTF(...) do {} while (false)
#endif

/*
 * Return a pointer to the first occurrence of needle[0..needle_len) inside
 * hay[0..hay_len), or NULL when there is none.  Never reads outside hay.
 */
static const char *find_bytes(const char *hay, size_t hay_len, const char *needle, size_t needle_len)
{
	if (needle_len == 0) {
		return hay;
	}
	while (hay_len >= needle_len) {
		// Only positions that leave room for the whole needle are candidates.
		const char *hit = memchr(hay, needle[0], hay_len - needle_len + 1);
		if (!hit) {
			return NULL;
		}
		if (memcmp(hit, needle, needle_len) == 0) {
			return hit;
		}
		hay_len -= (size_t)(hit - hay) + 1;
		hay = hit + 1;
	}
	return NULL;
}

/* Advance p over spaces and tabs, never going past end. */
static const char *skip_blanks(const char *p, const char *end)
{
	while (p < end && (*p == ' ' || *p == '\t')) {
		++p;
	}
	return p;
}

/*
 * Parse an unsigned decimal number from [p, end) into *out.
 * Returns the position after the last digit, or NULL when there is no digit
 * at p or the value does not fit into size_t.
 */
static const char *parse_decimal(const char *p, const char *end, size_t *out)
{
	const char *q = p;
	size_t value = 0;

	while (q < end && *q >= '0' && *q <= '9') {
		size_t digit = (size_t)(*q - '0');
		if (value > (SIZE_MAX - digit) / 10) {
			return NULL; // would overflow size_t
		}
		value = value * 10 + digit;
		++q;
	}
	if (q == p) {
		return NULL;
	}
	*out = value;
	return q;
}

/*
 * Parse the WARC record header block at the start of data[0..avail).
 *
 * On success returns 0 and stores the header block length (including the
 * terminating CRLF CRLF) in *header_len, the Content-Length value in
 * *body_len, and STATE_RESPONSE_RECORD or STATE_OTHER_RECORD in
 * *record_state.  On failure prints a message to stderr and returns -1.
 */
static int parse_record_header(const char *data, size_t avail, size_t *header_len, size_t *body_len, int *record_state)
{
	if (avail < 10) {
		fprintf(stderr, "Error: too little data before WARC headers");
		return -1;
	}
	if (memcmp(data, "WARC/1.0\r\n", 10) != 0 && memcmp(data, "WARC/1.1\r\n", 10) != 0) {
		fprintf(stderr, "Error: expected header line, got something else\n");
		return -1;
	}

	// Restrict the field searches to the header block (plus the CRLF that ends
	// its last line) so that payload bytes can never be mistaken for a header.
	// If the end is not found, search everything and report that error last.
	const char *headers_end = find_bytes(data, avail, "\r\n\r\n", 4);
	size_t scan_len = headers_end ? (size_t)(headers_end - data) + 2 : avail;

	const char *field = find_bytes(data, scan_len, "\r\nContent-Length:", 17);
	if (!field) {
		fprintf(stderr, "Error: Content-Length missing");
		return -1;
	}
	DEBUG_PRINTF("Found Content-Length header at offset %zu\n", (size_t)(field + 2 - data));
	const char *eol = find_bytes(field + 2, scan_len - (size_t)(field + 2 - data), "\r\n", 2);
	if (!eol) {
		fprintf(stderr, "Error: CRLF after Content-Length missing");
		return -1;
	}
	const char *value = skip_blanks(field + 17, eol);
	value = parse_decimal(value, eol, body_len);
	if (!value) {
		fprintf(stderr, "Error: invalid Content-Length");
		return -1;
	}
	if (skip_blanks(value, eol) != eol) {
		fprintf(stderr, "Error: invalid Content-Length (noise before EOL)");
		return -1;
	}
	DEBUG_PRINTF("Record body length: %zu\n", *body_len);

	field = find_bytes(data, scan_len, "\r\nWARC-Type:", 12);
	if (!field) {
		fprintf(stderr, "Error: WARC-Type missing");
		return -1;
	}
	DEBUG_PRINTF("Found WARC-Type header at offset %zu\n", (size_t)(field + 2 - data));
	eol = find_bytes(field + 2, scan_len - (size_t)(field + 2 - data), "\r\n", 2);
	if (!eol) {
		fprintf(stderr, "Error: CRLF after WARC-Type missing");
		return -1;
	}
	value = skip_blanks(field + 12, eol);
	// Prefix match, bounded by the end of the header line.
	if ((size_t)(eol - value) >= 8 && memcmp(value, "response", 8) == 0) {
		DEBUG_PRINTF("Response record\n");
		*record_state = STATE_RESPONSE_RECORD;
	} else {
		DEBUG_PRINTF("Other record\n");
		*record_state = STATE_OTHER_RECORD;
	}

	if (!headers_end) {
		fprintf(stderr, "Error: end of headers not found");
		return -1;
	}
	*header_len = (size_t)(headers_end - data) + 4;
	DEBUG_PRINTF("Record body begins at offset %zu\n", *header_len);
	return 0;
}

int main(int argc, char* argv[]) {
	//TODO --meta or a similar way to get something like that?

	// Twice BUFSIZE: a refill appends up to BUFSIZE bytes behind fewer than
	// BUFSIZE leftover bytes.  Static so that it does not live on the stack.
	static char buf[2 * (BUFSIZE)];
	size_t start = 0;      // offset of the first unconsumed byte in buf
	size_t n = 0;          // number of unconsumed bytes from start
	bool eof = false;      // set once fread() comes up short (EOF or read error)
	int state = STATE_BEFORE_RECORD;
	size_t body_left = 0;  // body bytes of the current record not yet consumed

	(void)argc;
	(void)argv;

	for (;;) {
		// Keep at least BUFSIZE bytes buffered (or everything up to EOF) so a
		// whole header block is always available.  Leftover bytes are moved
		// to the front only here, not after every record.
		if (!eof && n < BUFSIZE) {
			if (start > 0) {
				memmove(buf, buf + start, n);
				start = 0;
			}
			size_t got = fread(buf + n, 1, BUFSIZE, stdin);
			if (got < BUFSIZE) {
				eof = true;
			}
			n += got;
		}
		char *cur = buf + start;
		DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)cur);
		DEBUG_PRINTF("State: %d\n", state);

		if (state == STATE_BEFORE_RECORD) {
			if (n == 0) {
				break; // clean end of input between records
			}
			size_t header_len = 0;
			if (parse_record_header(cur, n, &header_len, &body_left, &state) != 0) {
				return 1;
			}
			start += header_len;
			n -= header_len;
		} else if (state == STATE_RESPONSE_RECORD || state == STATE_OTHER_RECORD) {
			if (body_left > 0 && n == 0) {
				// n == 0 after a refill attempt means the input ended mid-record.
				fprintf(stderr, "Error: unexpected end of input inside record\n");
				return 1;
			}
			// Never hand more than the body itself to stdout; the record
			// terminator is checked separately in STATE_RECORD_TRAILER.
			size_t take = body_left < n ? body_left : n;
			if (state == STATE_RESPONSE_RECORD && take > 0) {
				DEBUG_PRINTF("Copying %zu bytes to stdout\n", take);
				if (fwrite(cur, 1, take, stdout) != take) {
					fprintf(stderr, "Error: write to stdout failed\n");
					return 1;
				}
			}
			start += take;
			n -= take;
			body_left -= take;
			if (body_left == 0) {
				DEBUG_PRINTF("Full record\n");
				if (state == STATE_RESPONSE_RECORD && fputc('\n', stdout) == EOF) {
					fprintf(stderr, "Error: write to stdout failed\n");
					return 1;
				}
				state = STATE_RECORD_TRAILER;
			} else {
				DEBUG_PRINTF("Partial record, %zu body bytes still missing\n", body_left);
			}
		} else {
			// STATE_RECORD_TRAILER: every record ends with CRLF CRLF.
			if (n < 4 || memcmp(cur, "\r\n\r\n", 4) != 0) {
				fprintf(stderr, "Error: end of block not found");
				return 1;
			}
			start += 4;
			n -= 4;
			state = STATE_BEFORE_RECORD;
		}
	}

	if (fflush(stdout) != 0) {
		fprintf(stderr, "Error: write to stdout failed\n");
		return 1;
	}
	return 0;
}