Browse Source

Add warc-dump-responses

master
JustAnotherArchivist 1 year ago
parent
commit
acd2fab899
2 changed files with 182 additions and 0 deletions
  1. +1
    -0
      warc-dump-responses
  2. +181
    -0
      warc-dump-responses.c

+ 1
- 0
warc-dump-responses View File

@@ -0,0 +1 @@
.make-and-exec

+ 181
- 0
warc-dump-responses.c View File

@@ -0,0 +1,181 @@
#define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Number of bytes requested from stdin per read. main() allocates a buffer of twice this size.
// Can be overridden at compile time since it is only defined here if not already defined.
#ifndef BUFSIZE
#define BUFSIZE 1048576
#endif

// Parser states used by main():
// BEFORE_RECORD: the buffer is expected to start with the WARC version line and headers of a record.
// RESPONSE_RECORD: inside the body of a record whose WARC-Type is response; the body is copied to stdout.
// OTHER_RECORD: inside the body of any other record; the body is consumed without being printed.
#define STATE_BEFORE_RECORD 0
#define STATE_RESPONSE_RECORD 1
#define STATE_OTHER_RECORD 2

// printf-style diagnostics on stderr when compiled with DEBUG defined; expands to an empty statement otherwise.
#ifdef DEBUG
#define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
#else
#define DEBUG_PRINTF(...) do {} while (false)
#endif

// Returns the first position in [p, end) that holds neither a space nor a tab, or end if there is none.
static char* skip_blanks(char* p, const char* end) {
	while (p < end && (*p == ' ' || *p == '\t')) ++p;
	return p;
}

// Writes len bytes to stdout. Prints an error and returns false if the write is short.
static bool write_stdout(const char* data, size_t len) {
	if (len > 0 && fwrite(data, 1, len, stdout) != len) {
		fprintf(stderr, "Error: write to stdout failed\n");
		return false;
	}
	return true;
}

// Looks for the header line introduced by name (which includes the preceding CRLF and the trailing colon)
// within the len bytes at headers. On success, *value points just past the colon and *eol at the CRLF that
// ends the line. On failure, an error naming label is printed and false is returned.
static bool find_field(char* headers, size_t len, const char* name, const char* label, char** value, char** eol) {
	size_t name_len = strlen(name);
	char* field = memmem(headers, len, name, name_len);
	if (!field) {
		fprintf(stderr, "Error: %s missing\n", label);
		return false;
	}
	DEBUG_PRINTF("Found %s header at %p (offset %zu)\n", label, (void*)(field + 2), (size_t)(field + 2 - headers));
	// Skip the CRLF that is part of the match; otherwise the search would find the start of this line, not its end.
	*eol = memmem(field + 2, len - (size_t)(field + 2 - headers), "\r\n", 2);
	if (!*eol) {
		fprintf(stderr, "Error: CRLF after %s missing\n", label);
		return false;
	}
	*value = field + name_len;
	return true;
}

// Parses the WARC version line and headers at the start of buf (n valid bytes).
// On success, stores the total header size including the terminating blank line in *headers_len,
// the Content-Length value in *content_length, and whether WARC-Type is "response" in *is_response.
// On failure, prints an error to stderr and returns false.
static bool parse_headers(char* buf, size_t n, size_t* headers_len, size_t* content_length, bool* is_response) {
	if (n < 10) {
		fprintf(stderr, "Error: too little data before WARC headers\n");
		return false;
	}
	if (memcmp(buf, "WARC/1.0\r\n", 10) != 0 && memcmp(buf, "WARC/1.1\r\n", 10) != 0) {
		fprintf(stderr, "Error: expected header line, got something else\n");
		return false;
	}

	// Find the end of the headers first so that the field lookups below cannot match inside the record body
	// or inside a later record that happens to be in the buffer as well.
	char* blank = memmem(buf, n, "\r\n\r\n", 4);
	if (!blank) {
		fprintf(stderr, "Error: end of headers not found\n");
		return false;
	}
	// Include the CRLF terminating the last header line, but not the empty line after it.
	size_t fields_len = (size_t)(blank - buf) + 2;

	char* value;
	char* eol;
	if (!find_field(buf, fields_len, "\r\nContent-Length:", "Content-Length", &value, &eol)) {
		return false;
	}
	value = skip_blanks(value, eol);
	if (value == eol || *value < '0' || *value > '9') {
		fprintf(stderr, "Error: invalid Content-Length\n");
		return false;
	}
	// The buffer is not NUL-terminated, so parse the digits by hand within the bounds of the line.
	size_t length = 0;
	while (value < eol && *value >= '0' && *value <= '9') {
		size_t digit = (size_t)(*value - '0');
		if (length > ((size_t)-1 - digit) / 10) {
			// Value does not fit into size_t
			fprintf(stderr, "Error: invalid Content-Length\n");
			return false;
		}
		length = length * 10 + digit;
		++value;
	}
	if (skip_blanks(value, eol) != eol) {
		fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
		return false;
	}
	DEBUG_PRINTF("Record body length: %zu\n", length);

	if (!find_field(buf, fields_len, "\r\nWARC-Type:", "WARC-Type", &value, &eol)) {
		return false;
	}
	value = skip_blanks(value, eol);
	while (eol > value && (eol[-1] == ' ' || eol[-1] == '\t')) --eol;
	*is_response = (eol - value == 8 && memcmp(value, "response", 8) == 0);

	*content_length = length;
	*headers_len = (size_t)(blank - buf) + 4;
	return true;
}

// Read stdin, decode WARC, dump all response record bodies to stdout.
// One LF is inserted at the end of each response to ensure that a new record always begins on a new line.
// Headers must fit into BUFSIZE.
// Returns 0 when the input ends (including input that is cut off in the middle of a record, as before),
// 1 after printing a message to stderr on malformed input or a failed write.
int main(int argc, char* argv[]) {
	//TODO --meta or a similar way to get something like that?

	// Static rather than automatic: two BUFSIZE worth of bytes is a lot to put on the stack.
	static char buf[2 * BUFSIZE];
	size_t n = 0;
	int state = STATE_BEFORE_RECORD;
	size_t record_bytes_read = 0; // Body bytes of the current record consumed so far; never exceeds record_length
	size_t record_length = 0;

	(void)argc;
	(void)argv;

	for (;;) {
		if (n == 0) {
			// An empty buffer only means that more data is needed; the input is over when nothing more can be read.
			n = fread(buf, 1, BUFSIZE, stdin);
			if (n == 0) {
				break;
			}
		}
		DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)buf);
		DEBUG_PRINTF("State: %d\n", state);

		if (state == STATE_BEFORE_RECORD) {
			size_t headers_len;
			bool is_response;
			if (!parse_headers(buf, n, &headers_len, &record_length, &is_response)) {
				return 1;
			}
			if (is_response) {
				DEBUG_PRINTF("Response record\n");
				state = STATE_RESPONSE_RECORD;
			} else {
				DEBUG_PRINTF("Other record\n");
				state = STATE_OTHER_RECORD;
			}
			DEBUG_PRINTF("Record body begins at %p (offset %zu)\n", (void*)(buf + headers_len), headers_len);

			//TODO Replace all the memmove business with pointer logic to avoid needless constant memory copying (is more wrooom).
			DEBUG_PRINTF("Moving %zu bytes backwards by %zu\n", n - headers_len, headers_len);
			memmove(buf, buf + headers_len, n - headers_len);
			n -= headers_len;
			record_bytes_read = 0;
			continue;
		}

		// STATE_RESPONSE_RECORD or STATE_OTHER_RECORD: the buffer starts somewhere inside the record body
		// or inside the CRLFCRLF that follows it.
		size_t body_left = record_length - record_bytes_read;
		if (n < 4 || n - 4 < body_left) {
			// Only got part of the record body and/or its trailing CRLFCRLF.
			// Never treat trailer bytes as body: they stay in the buffer until all four of them are available.
			size_t chunk = n < body_left ? n : body_left;
			size_t leftover = n - chunk; // At most 3 bytes of trailer
			DEBUG_PRINTF("Partial record\n");
			if (state == STATE_RESPONSE_RECORD) {
				DEBUG_PRINTF("Copying %zu bytes to stdout\n", chunk);
				if (!write_stdout(buf, chunk)) {
					return 1;
				}
			}
			record_bytes_read += chunk;
			DEBUG_PRINTF("%zu of %zu bytes from this record written\n", record_bytes_read, record_length);
			if (leftover > 0) {
				memmove(buf, buf + chunk, leftover);
			}
			size_t got = fread(buf + leftover, 1, BUFSIZE, stdin);
			if (got == 0) {
				// Input ends in the middle of a record
				break;
			}
			n = leftover + got;
			continue;
		}

		// Remainder of the record, including the trailing CRLFCRLF, is in the buffer.
		DEBUG_PRINTF("Full record\n");
		if (state == STATE_RESPONSE_RECORD) {
			DEBUG_PRINTF("Copying %zu bytes to stdout\n", body_left);
			if (!write_stdout(buf, body_left) || !write_stdout("\n", 1)) {
				return 1;
			}
		}
		if (memcmp(buf + body_left, "\r\n\r\n", 4) != 0) {
			fprintf(stderr, "Error: end of block not found\n");
			return 1;
		}
		size_t consumed = body_left + 4;
		DEBUG_PRINTF("Moving %zu bytes backwards by %zu\n", n - consumed, consumed);
		memmove(buf, buf + consumed, n - consumed);
		n -= consumed;
		// Fill the buffer back up so that it holds at least BUFSIZE bytes (unless the input ends),
		// which guarantees that the complete headers of the next record are available.
		if (n < BUFSIZE) {
			DEBUG_PRINTF("Refilling buffer\n");
			n += fread(buf + n, 1, BUFSIZE, stdin);
		}
		state = STATE_BEFORE_RECORD;
		if (n == 0) {
			break;
		}
	}

	if (fflush(stdout) != 0) {
		fprintf(stderr, "Error: write to stdout failed\n");
		return 1;
	}
	return 0;
}

Loading…
Cancel
Save