The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

214 lines
7.3 KiB

  1. #define _GNU_SOURCE
  2. #include <ctype.h>
  3. #include <stdbool.h>
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include <string.h>
  // Number of bytes requested from stdin per read. May be overridden at compile time (e.g. -DBUFSIZE=...).
  #if !defined(BUFSIZE)
  #define BUFSIZE 1048576
  #endif

  // Parser states: waiting for a record's headers, inside the body of a "response" record,
  // or inside the body of any other record type.
  enum {
      STATE_BEFORE_RECORD = 0,
      STATE_RESPONSE_RECORD = 1,
      STATE_OTHER_RECORD = 2
  };

  // printf-style tracing to stderr; expands to an empty statement unless DEBUG is defined.
  #if defined(DEBUG)
  #define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
  #else
  #define DEBUG_PRINTF(...) do { } while (0)
  #endif
  18. int main(int argc, char* argv[]) {
  19. // Read stdin, decode WARC, dump all response record bodies to stdout.
  20. // One LF is inserted at the end of each response to ensure that a new record always begins on a new line.
  21. // If the --meta option is given, one line is printed before each record consisting of the WARC-Target-URI, a space, the record length in bytes in decimal notation, and a LF.
  22. // Headers must fit into BUFSIZE.
  23. // Does not fully comply with the WARC spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
  24. char buf[2 * BUFSIZE];
  25. size_t n;
  26. int state = STATE_BEFORE_RECORD;
  27. char* bufp;
  28. char* m0;
  29. char* m1;
  30. char* eoh;
  31. size_t record_bytes_read = 0;
  32. size_t record_length;
  33. long int nscan;
  34. bool meta = false;
  35. if (argc == 2 && strcmp(argv[1], "--meta") == 0) {
  36. meta = true;
  37. }
  38. while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
  39. bufp = buf;
  40. checkstate:
  41. DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
  42. DEBUG_PRINTF("Beginning of buffer: ");
  43. for (int i = 0; i < 64; ++i) DEBUG_PRINTF(isprint(*(bufp + i)) ? "%c" : "\\x%02x", *(bufp + i) & 0xFF);
  44. DEBUG_PRINTF("\n");
  45. if (n == 0) {
  46. break;
  47. }
  48. DEBUG_PRINTF("State: %d\n", state);
  49. if (state == STATE_BEFORE_RECORD) {
  50. if (n < 10) {
  51. fprintf(stderr, "Error: too little data before WARC headers\n");
  52. return 1;
  53. }
  54. if (memcmp(bufp, "WARC/1.0\r\n", 10) == 0 || memcmp(bufp, "WARC/1.1\r\n", 10) == 0) {
  55. // Got some headers; find the record type, content length, and end of headers
  56. eoh = memmem(bufp, n, "\r\n\r\n", 4);
  57. if (!eoh) {
  58. fprintf(stderr, "Error: end of headers not found\n");
  59. return 1;
  60. }
  61. eoh += 4;
  62. DEBUG_PRINTF("Record body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp);
  63. m0 = memmem(bufp, n, "\r\nContent-Length:", 17);
  64. if (!m0 || m0 >= eoh) {
  65. fprintf(stderr, "Error: Content-Length missing\n");
  66. return 1;
  67. }
  68. DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  69. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  70. if (!m1) {
  71. fprintf(stderr, "Error: CRLF after Content-Length missing\n");
  72. return 1;
  73. }
  74. m0 += 17;
  75. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  76. if (sscanf(m0, "%zu%ln", &record_length, &nscan) <= 0) {
  77. fprintf(stderr, "Error: invalid Content-Length\n");
  78. return 1;
  79. }
  80. if (nscan < 0) {
  81. fprintf(stderr, "Error: negative nscan\n");
  82. return 1;
  83. }
  84. if (m0 + nscan > bufp + n) {
  85. fprintf(stderr, "Error: buffer overread\n");
  86. return 1;
  87. }
  88. m0 += nscan;
  89. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  90. if (m0 != m1) {
  91. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  92. return 1;
  93. }
  94. DEBUG_PRINTF("Record body length: %zu\n", record_length);
  95. m0 = memmem(bufp, n, "\r\nWARC-Type:", 12);
  96. if (!m0 || m0 >= eoh) {
  97. fprintf(stderr, "Error: WARC-Type missing\n");
  98. return 1;
  99. }
  100. DEBUG_PRINTF("Found WARC-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  101. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  102. if (!m1) {
  103. fprintf(stderr, "Error: CRLF after WARC-Type missing\n");
  104. return 1;
  105. }
  106. m0 += 12;
  107. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  108. if (memcmp(m0, "response", 8) == 0) {
  109. DEBUG_PRINTF("Response record\n");
  110. state = STATE_RESPONSE_RECORD;
  111. } else {
  112. DEBUG_PRINTF("Other record\n");
  113. state = STATE_OTHER_RECORD;
  114. }
  115. if (meta && state == STATE_RESPONSE_RECORD) {
  116. m0 = memmem(bufp, n, "\r\nWARC-Target-URI:", 18);
  117. if (!m0 || m0 >= eoh) {
  118. fprintf(stderr, "Error: WARC-Target-URI missing\n");
  119. return 1;
  120. }
  121. DEBUG_PRINTF("Found WARC-Target-URI header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  122. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  123. if (!m1) {
  124. fprintf(stderr, "Error: CRLF after WARC-Target-URI missing\n");
  125. return 1;
  126. }
  127. m0 += 18;
  128. while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0;
  129. DEBUG_PRINTF("WARC-Target-URI value starts at %p (offset %zu)\n", (void*)m0, m0 - bufp);
  130. --m1;
  131. while (m1 > m0 && (*m1 == ' ' || *m1 == '\t')) --m1;
  132. DEBUG_PRINTF("WARC-Target-URI value ends at %p (offset %zu)\n", (void*)(m1 + 1), m1 + 1 - bufp);
  133. if (m1 <= m0) {
  134. fprintf(stderr, "Error: empty WARC-Target-URI\n");
  135. return 1;
  136. }
  137. fwrite(m0, 1, m1 + 1 - m0, stdout);
  138. fprintf(stdout, " %zu\n", record_length);
  139. }
  140. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", eoh - bufp);
  141. n = n - (eoh - bufp);
  142. bufp = eoh;
  143. record_bytes_read = 0;
  144. goto checkstate;
  145. } else {
  146. fprintf(stderr, "Error: expected header line, got something else\n");
  147. return 1;
  148. }
  149. } else if (state == STATE_RESPONSE_RECORD || state == STATE_OTHER_RECORD) {
  150. if (record_length + 4 - record_bytes_read > n) {
  151. // Only got part of the record body
  152. DEBUG_PRINTF("Partial record\n");
  153. // Handle the case when the terminating CRLFCRLF is truncated
  154. size_t tocopy = record_length - record_bytes_read > n ? n : record_length - record_bytes_read;
  155. if (state == STATE_RESPONSE_RECORD) {
  156. DEBUG_PRINTF("Copying %zu bytes to stdout\n", tocopy);
  157. fwrite(bufp, 1, tocopy, stdout);
  158. }
  159. record_bytes_read += tocopy;
  160. DEBUG_PRINTF("%zu of %zu bytes from this record written\n", record_bytes_read, record_length);
  161. if (tocopy != n) {
  162. DEBUG_PRINTF("Truncated end of block\n");
  163. n = n - tocopy;
  164. bufp = bufp + tocopy;
  165. if (n < BUFSIZE) { // Should always be true
  166. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  167. memmove(buf, bufp, n);
  168. bufp = buf;
  169. n += fread(buf + n, 1, BUFSIZE, stdin);
  170. }
  171. goto checkstate;
  172. }
  173. } else {
  174. // Remainder of the record is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
  175. DEBUG_PRINTF("Full record\n");
  176. if (state == STATE_RESPONSE_RECORD) {
  177. DEBUG_PRINTF("Copying %zu bytes to stdout\n", record_length - record_bytes_read);
  178. fwrite(bufp, 1, record_length - record_bytes_read, stdout);
  179. fprintf(stdout, "\n");
  180. }
  181. if (memcmp(bufp + record_length - record_bytes_read, "\r\n\r\n", 4) != 0) {
  182. fprintf(stderr, "Error: end of block not found\n");
  183. return 1;
  184. }
  185. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", record_length + 4 - record_bytes_read);
  186. n = n - (record_length + 4 - record_bytes_read);
  187. bufp = bufp + record_length + 4 - record_bytes_read;
  188. if (n < BUFSIZE) {
  189. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  190. memmove(buf, bufp, n);
  191. bufp = buf;
  192. n += fread(buf + n, 1, BUFSIZE, stdin);
  193. }
  194. state = STATE_BEFORE_RECORD;
  195. goto checkstate;
  196. }
  197. }
  198. }
  199. if (state != STATE_BEFORE_RECORD) {
  200. fprintf(stderr, "Error: incomplete record at the end of input\n");
  201. return 1;
  202. }
  203. }