The little things give you away... A collection of various small helper stuff
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

304 lines
10 KiB

  1. #define _GNU_SOURCE
  2. #include <ctype.h>
  3. #include <stdbool.h>
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include <string.h>
  7. #ifndef BUFSIZE
  8. #define BUFSIZE 1048576
  9. #endif
  10. #define STATE_HEADERS 0
  11. #define STATE_BODY 1 // Body with a Content-Length header
  12. #define STATE_CHUNK_LINE 2
  13. #define STATE_CHUNK_CONTENTS 3
  14. #ifdef DEBUG
  15. #define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
  16. #else
  17. #define DEBUG_PRINTF(...) do {} while (false)
  18. #endif
  19. int main(int argc, char* argv[]) {
  20. // Read stdin, decode HTTP responses, dump all bodies to stdout.
  21. // stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses).
  22. // One LF is inserted at the end of each response to ensure that a new response always begins on a new line.
  23. // If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake <base> tag: <base href="URL">. The line is terminated with a LF.
  24. // Headers and chunk lines must fit into BUFSIZE.
  25. // Does not fully comply with the HTTP spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
  26. char buf[2 * BUFSIZE];
  27. size_t n;
  28. int state = STATE_HEADERS;
  29. char* bufp;
  30. char* m0;
  31. char* m1;
  32. char* eoh;
  33. size_t nscan;
  34. size_t bytes_read;
  35. size_t length;
  36. bool html_fake_base = false;
  37. char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below.
  38. size_t urllen;
  39. if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) {
  40. html_fake_base = true;
  41. }
  42. while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
  43. bufp = buf;
  44. checkstate:
  45. DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
  46. DEBUG_PRINTF("Beginning of buffer: ");
  47. for (int i = 0; i < (n > 64 ? 64 : n); ++i) DEBUG_PRINTF(isprint(*(bufp + i)) ? "%c" : "\\x%02x", *(bufp + i) & 0xFF);
  48. DEBUG_PRINTF("\n");
  49. if (n == 0) {
  50. break;
  51. }
  52. DEBUG_PRINTF("State: %d\n", state);
  53. if (state == STATE_HEADERS) {
  54. if (n < 9) {
  55. fprintf(stderr, "Error: too little data before HTTP headers\n");
  56. return 1;
  57. }
  58. // Handle optional URL + Length line
  59. url = NULL;
  60. if (memcmp(bufp, "HTTP/1.1 ", 9) != 0) {
  61. DEBUG_PRINTF("No HTTP header, looking for URL line\n");
  62. m0 = memmem(bufp, n, "\n", 1);
  63. if (!m0 || m0 == bufp) {
  64. fprintf(stderr, "Error: expected HTTP headers or URL line, got neither\n");
  65. return 1;
  66. }
  67. m1 = m0;
  68. // Skip over length field, which we don't need.
  69. --m0;
  70. while (m0 > bufp && '0' <= *m0 && *m0 <= '9') --m0;
  71. if (*m0 != ' ') {
  72. fprintf(stderr, "Error: URL line has unexpected format\n");
  73. return 1;
  74. }
  75. // Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace.
  76. url = bufp;
  77. urllen = m0 - bufp;
  78. if (!memmem(url, urllen, "://", 3)) {
  79. fprintf(stderr, "Error: URL line contains no scheme\n");
  80. return 1;
  81. }
  82. m0 = url;
  83. while (m0 < bufp + urllen && *m0 != '\r' && *m0 != '\n' && *m0 != ' ' && *m0 != '\t') ++m0;
  84. if (m0 != bufp + urllen) {
  85. fprintf(stderr, "Error: URL contains CR, LF, or whitespace\n");
  86. return 1;
  87. }
  88. DEBUG_PRINTF("Found URL: ");
  89. for (int i = 0; i < (urllen > 64 ? 64 : urllen); ++i) DEBUG_PRINTF(isprint(*(url + i)) ? "%c" : "\\x%02x", *(url + i) & 0xFF);
  90. if (urllen > 64) DEBUG_PRINTF("<...>");
  91. DEBUG_PRINTF("\n");
  92. // Skip over URL line and continue processing below
  93. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 1 - bufp);
  94. n = n - (m1 + 1 - bufp);
  95. bufp = m1 + 1;
  96. }
  97. if (html_fake_base && !url) {
  98. fprintf(stderr, "Error: --html-fake-base requires URL lines\n");
  99. return 1;
  100. }
  101. if (n < 9) {
  102. fprintf(stderr, "Error: too little data before HTTP headers\n");
  103. return 1;
  104. }
  105. if (memcmp(bufp, "HTTP/1.1 ", 9) == 0) {
  106. // Got some headers; find transfer encoding, content length, and end of headers
  107. eoh = memmem(bufp, n, "\r\n\r\n", 4);
  108. if (!eoh) {
  109. fprintf(stderr, "Error: end of headers not found\n");
  110. return 1;
  111. }
  112. eoh += 4;
  113. DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp);
  114. m0 = memmem(bufp, n, "\r\nContent-Length:", 17);
  115. if (m0 && m0 < eoh) {
  116. DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  117. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  118. if (!m1) {
  119. fprintf(stderr, "Error: CRLF after Content-Length missing\n");
  120. return 1;
  121. }
  122. m0 += 17;
  123. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  124. if (!sscanf(m0, "%zu%n", &length, &nscan)) {
  125. fprintf(stderr, "Error: invalid Content-Length\n");
  126. return 1;
  127. }
  128. if (nscan > n - (m0 - bufp)) {
  129. fprintf(stderr, "Error: buffer overread\n");
  130. return 1;
  131. }
  132. m0 += nscan;
  133. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  134. if (m0 != m1) {
  135. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  136. return 1;
  137. }
  138. DEBUG_PRINTF("Content length: %zu\n", length);
  139. state = STATE_BODY;
  140. } else {
  141. m0 = memmem(bufp, n, "\r\nTransfer-Encoding:", 20);
  142. if (!m0 || m0 >= eoh) {
  143. fprintf(stderr, "Error: Content-Length and Transfer-Encoding missing\n");
  144. return 1;
  145. }
  146. DEBUG_PRINTF("Found Transfer-Encoding header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  147. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  148. if (!m1 || m1 >= eoh - 2) {
  149. fprintf(stderr, "Error: CRLF after Transfer-Encoding missing\n");
  150. return 1;
  151. }
  152. m0 += 20;
  153. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  154. if (memcmp(m0, "chunked", 7) != 0) {
  155. fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
  156. return 1;
  157. }
  158. m0 += 7;
  159. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  160. if (m0 != m1) {
  161. fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
  162. return 1;
  163. }
  164. DEBUG_PRINTF("Chunked transfer encoding\n");
  165. state = STATE_CHUNK_LINE;
  166. }
  167. if (html_fake_base) {
  168. m0 = memmem(bufp, n, "\r\nContent-Type:", 15);
  169. if (m0 && m0 < eoh) {
  170. DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  171. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  172. if (!m1) {
  173. fprintf(stderr, "Error: CRLF after Content-Type missing\n");
  174. return 1;
  175. }
  176. m0 += 15;
  177. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  178. if (memcmp(m0, "text/html", 9) == 0) {
  179. DEBUG_PRINTF("Is HTML response, inserting fake base tag\n");
  180. fprintf(stdout, "<base href=\"");
  181. fwrite(url, 1, urllen, stdout);
  182. fprintf(stdout, "\">\n");
  183. }
  184. }
  185. }
  186. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", eoh - bufp);
  187. n = n - (eoh - bufp);
  188. bufp = eoh;
  189. bytes_read = 0;
  190. goto checkstate;
  191. } else {
  192. fprintf(stderr, "Error: expected header line, got something else\n");
  193. return 1;
  194. }
  195. } else if (state == STATE_BODY || state == STATE_CHUNK_CONTENTS) {
  196. if (length + (state == STATE_BODY ? 1 : 2) - bytes_read > n) {
  197. // Only got part of the body
  198. DEBUG_PRINTF("Partial body\n");
  199. DEBUG_PRINTF("Copying %zu bytes to stdout\n", n);
  200. fwrite(bufp, 1, n, stdout);
  201. bytes_read += n;
  202. DEBUG_PRINTF("%zu of %zu bytes from this response written\n", bytes_read, length);
  203. } else {
  204. // Remainder of the response is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
  205. DEBUG_PRINTF("Full body\n");
  206. DEBUG_PRINTF("Copying %zu bytes to stdout\n", length - bytes_read);
  207. fwrite(bufp, 1, length - bytes_read, stdout);
  208. fprintf(stdout, "\n");
  209. if (state == STATE_CHUNK_CONTENTS && *(bufp + length - bytes_read) == '\r') {
  210. // Stupid hack to enforce the CRLF
  211. ++length;
  212. }
  213. if (memcmp(bufp + length - bytes_read, "\n", 1) != 0) {
  214. fprintf(stderr, "Error: end of HTTP body not found\n");
  215. return 1;
  216. }
  217. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", length + 1 - bytes_read);
  218. n = n - (length + 1 - bytes_read);
  219. bufp = bufp + length + 1 - bytes_read;
  220. if (n < BUFSIZE) {
  221. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  222. memmove(buf, bufp, n);
  223. bufp = buf;
  224. n += fread(buf + n, 1, BUFSIZE, stdin);
  225. }
  226. if (state == STATE_BODY) {
  227. state = STATE_HEADERS;
  228. } else {
  229. state = STATE_CHUNK_LINE;
  230. }
  231. goto checkstate;
  232. }
  233. } else if (state == STATE_CHUNK_LINE) {
  234. m1 = memmem(bufp, n, "\r\n", 2);
  235. if (!m1) {
  236. fprintf(stderr, "Error: chunk line EOL missing\n");
  237. return 1;
  238. }
  239. m0 = bufp;
  240. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  241. if (!sscanf(m0, "%x%n", &length, &nscan)) {
  242. fprintf(stderr, "Error: invalid chunk length\n");
  243. return 1;
  244. }
  245. if (nscan > n - (m0 - bufp)) {
  246. fprintf(stderr, "Error: buffer overread\n");
  247. return 1;
  248. }
  249. m0 += nscan;
  250. while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0;
  251. if (*m0 != ';' && m0 != m1) {
  252. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  253. return 1;
  254. }
  255. DEBUG_PRINTF("Chunk length: %zu bytes\n", length);
  256. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 2 - bufp);
  257. n = n - (m1 + 2 - bufp);
  258. bufp = m1 + 2;
  259. if (length == 0) {
  260. // End of response, must be followed by CRLF + LF
  261. if (n < 3) {
  262. fprintf(stderr, "Error: buffer exhausted while looking for empty chunk CRLF\n");
  263. return 1;
  264. }
  265. if (*(m1 + 2) != '\r' || *(m1 + 3) != '\n' || *(m1 + 4) != '\n') {
  266. fprintf(stderr, "Error: end of HTTP body not found\n");
  267. return 1;
  268. }
  269. n -= 3;
  270. bufp += 3;
  271. state = STATE_HEADERS;
  272. } else {
  273. state = STATE_CHUNK_CONTENTS;
  274. }
  275. if (n < BUFSIZE) {
  276. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  277. memmove(buf, bufp, n);
  278. bufp = buf;
  279. n += fread(buf + n, 1, BUFSIZE, stdin);
  280. }
  281. goto checkstate;
  282. }
  283. }
  284. if (state != STATE_HEADERS) {
  285. fprintf(stderr, "Error: incomplete body at the end of input\n");
  286. return 1;
  287. }
  288. }