The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

182 lines
6.2 KiB

  // Read a WARC stream from stdin and dump the bodies of all response records to stdout.
  #define _GNU_SOURCE // memmem() is a GNU extension
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  // Refill granularity of the input buffer. The headers of a single record must fit into BUFSIZE bytes.
  #ifndef BUFSIZE
  #define BUFSIZE 1048576
  #endif
  #if BUFSIZE < 16
  #error "BUFSIZE must be at least 16 bytes"
  #endif

  // Parser states
  #define STATE_BEFORE_RECORD 0   // at the start of a record, or at the end of the input
  #define STATE_RESPONSE_RECORD 1 // inside the body of a response record (copied to stdout)
  #define STATE_OTHER_RECORD 2    // inside the body of any other record (skipped)
  #define STATE_RECORD_END 3      // body fully consumed; the record terminator comes next

  // Every record body is followed by two CRLFs.
  #define RECORD_TRAILER "\r\n\r\n"
  #define RECORD_TRAILER_LEN 4

  #ifdef DEBUG
  #define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
  #else
  #define DEBUG_PRINTF(...) do {} while (false)
  #endif

  // Return the first position in [p, end) that holds neither a space nor a tab, or end if there is none.
  static const char *skip_blanks(const char *p, const char *end) {
      while (p < end && (*p == ' ' || *p == '\t')) {
          ++p;
      }
      return p;
  }

  // Look up one header inside the header block [start, end), where end points just past the CRLF that
  // terminates the last header line. needle is the header name written as "\r\nName:".
  // On success, [*value, *value_end) is the header value without leading or trailing blanks.
  // Returns false if the header is not present in the block.
  static bool find_header_value(const char *start, const char *end, const char *needle,
                                const char **value, const char **value_end) {
      size_t needle_len = strlen(needle);
      const char *hit = memmem(start, (size_t)(end - start), needle, needle_len);
      if (!hit) {
          return false;
      }
      const char *v = hit + needle_len;
      const char *eol = memmem(v, (size_t)(end - v), "\r\n", 2);
      if (!eol) {
          return false;
      }
      v = skip_blanks(v, eol);
      while (eol > v && (eol[-1] == ' ' || eol[-1] == '\t')) {
          --eol;
      }
      *value = v;
      *value_end = eol;
      return true;
  }

  // Parse the decimal number in [value, value_end) into *out.
  // The buffer is not NUL-terminated, so this deliberately avoids sscanf()/strtoul().
  // Prints a diagnostic and returns false on an empty value, a non-digit, or overflow of size_t.
  static bool parse_content_length(const char *value, const char *value_end, size_t *out) {
      const size_t max = (size_t)-1;
      const char *p = value;
      size_t length = 0;
      if (p == value_end || *p < '0' || *p > '9') {
          fprintf(stderr, "Error: invalid Content-Length\n");
          return false;
      }
      while (p < value_end && *p >= '0' && *p <= '9') {
          size_t digit = (size_t)(*p - '0');
          if (length > (max - digit) / 10) {
              fprintf(stderr, "Error: invalid Content-Length\n");
              return false;
          }
          length = length * 10 + digit;
          ++p;
      }
      if (p != value_end) {
          fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
          return false;
      }
      *out = length;
      return true;
  }

  // Parse the headers of the record that starts at buf, of which n bytes are available.
  // On success, *header_len is the number of bytes up to and including the blank line that ends the headers,
  // *content_length is the body length, and *is_response tells whether WARC-Type is exactly "response".
  // Prints a diagnostic and returns false on malformed or incomplete headers.
  static bool parse_record_headers(const char *buf, size_t n, size_t *header_len,
                                   size_t *content_length, bool *is_response) {
      if (n < 10) {
          fprintf(stderr, "Error: too little data before WARC headers\n");
          return false;
      }
      if (memcmp(buf, "WARC/1.0\r\n", 10) != 0 && memcmp(buf, "WARC/1.1\r\n", 10) != 0) {
          fprintf(stderr, "Error: expected header line, got something else\n");
          return false;
      }

      // Find the end of the headers first so that the lookups below can never match inside the record body.
      const char *blank_line = memmem(buf, n, "\r\n\r\n", 4);
      if (!blank_line) {
          fprintf(stderr, "Error: end of headers not found\n");
          return false;
      }
      const char *headers_end = blank_line + 2;

      const char *value;
      const char *value_end;
      if (!find_header_value(buf, headers_end, "\r\nContent-Length:", &value, &value_end)) {
          fprintf(stderr, "Error: Content-Length missing\n");
          return false;
      }
      if (!parse_content_length(value, value_end, content_length)) {
          return false;
      }
      DEBUG_PRINTF("Record body length: %zu\n", *content_length);

      if (!find_header_value(buf, headers_end, "\r\nWARC-Type:", &value, &value_end)) {
          fprintf(stderr, "Error: WARC-Type missing\n");
          return false;
      }
      *is_response = (value_end - value) == 8 && memcmp(value, "response", 8) == 0;
      DEBUG_PRINTF(*is_response ? "Response record\n" : "Other record\n");

      *header_len = (size_t)(blank_line + 4 - buf);
      DEBUG_PRINTF("Record body begins at offset %zu\n", *header_len);
      return true;
  }

  // Drop the first count bytes of the *n buffered bytes by moving the remainder to the front of buf.
  //TODO Replace all the memmove business with pointer logic to avoid needless constant memory copying (is more wrooom).
  static void discard_front(char *buf, size_t *n, size_t count) {
      DEBUG_PRINTF("Moving %zu bytes backwards by %zu\n", *n - count, count);
      memmove(buf, buf + count, *n - count);
      *n -= count;
  }

  // Read stdin, decode WARC, dump all response record bodies to stdout.
  // One LF is inserted at the end of each response to ensure that a new record always begins on a new line.
  // Headers must fit into BUFSIZE.
  // Returns 0 on success and 1 on malformed or truncated input or on an I/O error (diagnostic on stderr).
  //TODO --meta or a similar way to get something like that?
  int main(int argc, char *argv[]) {
      (void)argc;
      (void)argv;

      // Static rather than automatic: 2 * BUFSIZE is too large to put on the stack comfortably.
      // The buffer is refilled whenever it holds fewer than BUFSIZE bytes, so it never exceeds 2 * BUFSIZE - 1.
      static char buf[2 * BUFSIZE];
      size_t n = 0;         // number of valid bytes at the start of buf
      size_t body_left = 0; // bytes of the current record body not yet consumed
      int state = STATE_BEFORE_RECORD;
      bool at_eof = false;

      for (;;) {
          if (n < BUFSIZE && !at_eof) {
              DEBUG_PRINTF("Refilling buffer\n");
              size_t got = fread(buf + n, 1, BUFSIZE, stdin);
              n += got;
              if (got < BUFSIZE) {
                  if (ferror(stdin)) {
                      fprintf(stderr, "Error: reading from stdin failed\n");
                      return 1;
                  }
                  at_eof = true;
              }
          }
          DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void *)&buf);
          DEBUG_PRINTF("State: %d\n", state);

          if (state == STATE_BEFORE_RECORD) {
              if (n == 0) {
                  // Clean end of input between two records.
                  break;
              }
              size_t header_len;
              bool is_response;
              if (!parse_record_headers(buf, n, &header_len, &body_left, &is_response)) {
                  return 1;
              }
              state = is_response ? STATE_RESPONSE_RECORD : STATE_OTHER_RECORD;
              discard_front(buf, &n, header_len);
              continue;
          }

          if (state == STATE_RESPONSE_RECORD || state == STATE_OTHER_RECORD) {
              // Never consume more than the body itself; the record terminator may be split across refills.
              size_t chunk = n < body_left ? n : body_left;
              if (state == STATE_RESPONSE_RECORD && chunk > 0) {
                  DEBUG_PRINTF("Copying %zu bytes to stdout\n", chunk);
                  if (fwrite(buf, 1, chunk, stdout) != chunk) {
                      fprintf(stderr, "Error: writing to stdout failed\n");
                      return 1;
                  }
              }
              body_left -= chunk;
              discard_front(buf, &n, chunk);
              if (body_left > 0) {
                  DEBUG_PRINTF("Partial record, %zu bytes of body still missing\n", body_left);
                  if (at_eof) {
                      fprintf(stderr, "Error: unexpected end of input inside record body\n");
                      return 1;
                  }
                  continue;
              }
              DEBUG_PRINTF("Full record\n");
              if (state == STATE_RESPONSE_RECORD && fputc('\n', stdout) == EOF) {
                  fprintf(stderr, "Error: writing to stdout failed\n");
                  return 1;
              }
              state = STATE_RECORD_END;
          }

          // STATE_RECORD_END: the body is done, the record terminator must follow.
          if (n < RECORD_TRAILER_LEN) {
              if (at_eof) {
                  fprintf(stderr, "Error: end of block not found\n");
                  return 1;
              }
              continue;
          }
          if (memcmp(buf, RECORD_TRAILER, RECORD_TRAILER_LEN) != 0) {
              fprintf(stderr, "Error: end of block not found\n");
              return 1;
          }
          discard_front(buf, &n, RECORD_TRAILER_LEN);
          state = STATE_BEFORE_RECORD;
      }

      if (fflush(stdout) != 0) {
          fprintf(stderr, "Error: writing to stdout failed\n");
          return 1;
      }
      return 0;
  }