The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

214 lignes
7.3 KiB

  #define _GNU_SOURCE
  #include <ctype.h>
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  // Size of one read from stdin, in bytes. May be overridden at compile time (-DBUFSIZE=...).
  #if !defined(BUFSIZE)
  #define BUFSIZE 1048576
  #endif

  // Parser states: between records, inside a response record body, inside any other record body.
  enum {
      STATE_BEFORE_RECORD = 0,
      STATE_RESPONSE_RECORD = 1,
      STATE_OTHER_RECORD = 2
  };

  // DEBUG_PRINTF prints to stderr when compiled with -DDEBUG and expands to an empty statement otherwise.
  #if defined(DEBUG)
  #define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
  #else
  #define DEBUG_PRINTF(...) do {} while (0)
  #endif
  18. int main(int argc, char* argv[]) {
  19. // Read stdin, decode WARC, dump all response record bodies to stdout.
  20. // One LF is inserted at the end of each response to ensure that a new record always begins on a new line.
  21. // If the --meta option is given, one line is printed before each record consisting of the WARC-Target-URI, a space, the record length in bytes in decimal notation, and a LF.
  22. // Headers must fit into BUFSIZE.
  23. // Does not fully comply with the WARC spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
  24. char buf[2 * BUFSIZE];
  25. size_t n;
  26. int state = STATE_BEFORE_RECORD;
  27. char* bufp;
  28. char* m0;
  29. char* m1;
  30. char* eoh;
  31. size_t record_bytes_read = 0;
  32. size_t record_length;
  33. long int nscan;
  34. bool meta = false;
  35. if (argc == 2 && strcmp(argv[1], "--meta") == 0) {
  36. meta = true;
  37. }
  38. while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
  39. bufp = buf;
  40. checkstate:
  41. DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
  42. DEBUG_PRINTF("Beginning of buffer: ");
  43. for (int i = 0; i < 64; ++i) DEBUG_PRINTF(isprint(*(bufp + i)) ? "%c" : "\\x%02x", *(bufp + i) & 0xFF);
  44. DEBUG_PRINTF("\n");
  45. if (n == 0) {
  46. break;
  47. }
  48. DEBUG_PRINTF("State: %d\n", state);
  49. if (state == STATE_BEFORE_RECORD) {
  50. if (n < 10) {
  51. fprintf(stderr, "Error: too little data before WARC headers\n");
  52. return 1;
  53. }
  54. if (memcmp(bufp, "WARC/1.0\r\n", 10) == 0 || memcmp(bufp, "WARC/1.1\r\n", 10) == 0) {
  55. // Got some headers; find the record type, content length, and end of headers
  56. eoh = memmem(bufp, n, "\r\n\r\n", 4);
  57. if (!eoh) {
  58. fprintf(stderr, "Error: end of headers not found\n");
  59. return 1;
  60. }
  61. eoh += 4;
  62. DEBUG_PRINTF("Record body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp);
  63. m0 = memmem(bufp, n, "\r\nContent-Length:", 17);
  64. if (!m0 || m0 >= eoh) {
  65. fprintf(stderr, "Error: Content-Length missing\n");
  66. return 1;
  67. }
  68. DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  69. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  70. if (!m1) {
  71. fprintf(stderr, "Error: CRLF after Content-Length missing\n");
  72. return 1;
  73. }
  74. m0 += 17;
  75. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  76. if (sscanf(m0, "%zu%ln", &record_length, &nscan) <= 0) {
  77. fprintf(stderr, "Error: invalid Content-Length\n");
  78. return 1;
  79. }
  80. if (nscan < 0) {
  81. fprintf(stderr, "Error: negative nscan\n");
  82. return 1;
  83. }
  84. if (m0 + nscan > bufp + n) {
  85. fprintf(stderr, "Error: buffer overread\n");
  86. return 1;
  87. }
  88. m0 += nscan;
  89. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  90. if (m0 != m1) {
  91. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  92. return 1;
  93. }
  94. DEBUG_PRINTF("Record body length: %zu\n", record_length);
  95. m0 = memmem(bufp, n, "\r\nWARC-Type:", 12);
  96. if (!m0 || m0 >= eoh) {
  97. fprintf(stderr, "Error: WARC-Type missing\n");
  98. return 1;
  99. }
  100. DEBUG_PRINTF("Found WARC-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  101. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  102. if (!m1) {
  103. fprintf(stderr, "Error: CRLF after WARC-Type missing\n");
  104. return 1;
  105. }
  106. m0 += 12;
  107. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  108. if (memcmp(m0, "response", 8) == 0) {
  109. DEBUG_PRINTF("Response record\n");
  110. state = STATE_RESPONSE_RECORD;
  111. } else {
  112. DEBUG_PRINTF("Other record\n");
  113. state = STATE_OTHER_RECORD;
  114. }
  115. if (meta && state == STATE_RESPONSE_RECORD) {
  116. m0 = memmem(bufp, n, "\r\nWARC-Target-URI:", 18);
  117. if (!m0 || m0 >= eoh) {
  118. fprintf(stderr, "Error: WARC-Target-URI missing\n");
  119. return 1;
  120. }
  121. DEBUG_PRINTF("Found WARC-Target-URI header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  122. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  123. if (!m1) {
  124. fprintf(stderr, "Error: CRLF after WARC-Target-URI missing\n");
  125. return 1;
  126. }
  127. m0 += 18;
  128. while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0;
  129. DEBUG_PRINTF("WARC-Target-URI value starts at %p (offset %zu)\n", (void*)m0, m0 - bufp);
  130. --m1;
  131. while (m1 > m0 && (*m1 == ' ' || *m1 == '\t')) --m1;
  132. DEBUG_PRINTF("WARC-Target-URI value ends at %p (offset %zu)\n", (void*)(m1 + 1), m1 + 1 - bufp);
  133. if (m1 <= m0) {
  134. fprintf(stderr, "Error: empty WARC-Target-URI\n");
  135. return 1;
  136. }
  137. fwrite(m0, 1, m1 + 1 - m0, stdout);
  138. fprintf(stdout, " %zu\n", record_length);
  139. }
  140. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", eoh - bufp);
  141. n = n - (eoh - bufp);
  142. bufp = eoh;
  143. record_bytes_read = 0;
  144. goto checkstate;
  145. } else {
  146. fprintf(stderr, "Error: expected header line, got something else\n");
  147. return 1;
  148. }
  149. } else if (state == STATE_RESPONSE_RECORD || state == STATE_OTHER_RECORD) {
  150. if (record_length + 4 - record_bytes_read > n) {
  151. // Only got part of the record body
  152. DEBUG_PRINTF("Partial record\n");
  153. // Handle the case when the terminating CRLFCRLF is truncated
  154. size_t tocopy = record_length - record_bytes_read > n ? n : record_length - record_bytes_read;
  155. if (state == STATE_RESPONSE_RECORD) {
  156. DEBUG_PRINTF("Copying %zu bytes to stdout\n", tocopy);
  157. fwrite(bufp, 1, tocopy, stdout);
  158. }
  159. record_bytes_read += tocopy;
  160. DEBUG_PRINTF("%zu of %zu bytes from this record written\n", record_bytes_read, record_length);
  161. if (tocopy != n) {
  162. DEBUG_PRINTF("Truncated end of block\n");
  163. n = n - tocopy;
  164. bufp = bufp + tocopy;
  165. if (n < BUFSIZE) { // Should always be true
  166. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  167. memmove(buf, bufp, n);
  168. bufp = buf;
  169. n += fread(buf + n, 1, BUFSIZE, stdin);
  170. }
  171. goto checkstate;
  172. }
  173. } else {
  174. // Remainder of the record is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
  175. DEBUG_PRINTF("Full record\n");
  176. if (state == STATE_RESPONSE_RECORD) {
  177. DEBUG_PRINTF("Copying %zu bytes to stdout\n", record_length - record_bytes_read);
  178. fwrite(bufp, 1, record_length - record_bytes_read, stdout);
  179. fprintf(stdout, "\n");
  180. }
  181. if (memcmp(bufp + record_length - record_bytes_read, "\r\n\r\n", 4) != 0) {
  182. fprintf(stderr, "Error: end of block not found\n");
  183. return 1;
  184. }
  185. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", record_length + 4 - record_bytes_read);
  186. n = n - (record_length + 4 - record_bytes_read);
  187. bufp = bufp + record_length + 4 - record_bytes_read;
  188. if (n < BUFSIZE) {
  189. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  190. memmove(buf, bufp, n);
  191. bufp = buf;
  192. n += fread(buf + n, 1, BUFSIZE, stdin);
  193. }
  194. state = STATE_BEFORE_RECORD;
  195. goto checkstate;
  196. }
  197. }
  198. }
  199. if (state != STATE_BEFORE_RECORD) {
  200. fprintf(stderr, "Error: incomplete record at the end of input\n");
  201. return 1;
  202. }
  203. }