The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

304 lignes
10 KiB

#define _GNU_SOURCE
#include <ctype.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  // Number of bytes requested from stdin per read. main() allocates a working
// buffer of twice this size. Can be overridden at compile time (-DBUFSIZE=...).
#ifndef BUFSIZE
#define BUFSIZE 1048576
#endif

// Parser states used by main().
enum {
    STATE_HEADERS = 0,
    STATE_BODY = 1, // Body with a Content-Length header
    STATE_CHUNK_LINE = 2,
    STATE_CHUNK_CONTENTS = 3
};

// printf-style diagnostics on stderr; expands to an empty statement unless
// DEBUG is defined, in which case the arguments are not evaluated at all.
#ifdef DEBUG
#define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
#else
#define DEBUG_PRINTF(...) do {} while (false)
#endif
  19. int main(int argc, char* argv[]) {
  20. // Read stdin, decode HTTP responses, dump all bodies to stdout.
  21. // stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses).
  22. // One LF is inserted at the end of each response to ensure that a new response always begins on a new line.
  23. // If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake <base> tag: <base href="URL">. The line is terminated with a LF.
  24. // Headers and chunk lines must fit into BUFSIZE.
  25. // Does not fully comply with the HTTP spec. For example, headers must be capitalised canonically, and continuation lines are unsupported.
  26. char buf[2 * BUFSIZE];
  27. size_t n;
  28. int state = STATE_HEADERS;
  29. char* bufp;
  30. char* m0;
  31. char* m1;
  32. char* eoh;
  33. size_t nscan;
  34. size_t bytes_read;
  35. size_t length;
  36. bool html_fake_base = false;
  37. char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below.
  38. size_t urllen;
  39. if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) {
  40. html_fake_base = true;
  41. }
  42. while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
  43. bufp = buf;
  44. checkstate:
  45. DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
  46. DEBUG_PRINTF("Beginning of buffer: ");
  47. for (int i = 0; i < (n > 64 ? 64 : n); ++i) DEBUG_PRINTF(isprint(*(bufp + i)) ? "%c" : "\\x%02x", *(bufp + i) & 0xFF);
  48. DEBUG_PRINTF("\n");
  49. if (n == 0) {
  50. break;
  51. }
  52. DEBUG_PRINTF("State: %d\n", state);
  53. if (state == STATE_HEADERS) {
  54. if (n < 9) {
  55. fprintf(stderr, "Error: too little data before HTTP headers\n");
  56. return 1;
  57. }
  58. // Handle optional URL + Length line
  59. url = NULL;
  60. if (memcmp(bufp, "HTTP/1.1 ", 9) != 0) {
  61. DEBUG_PRINTF("No HTTP header, looking for URL line\n");
  62. m0 = memmem(bufp, n, "\n", 1);
  63. if (!m0 || m0 == bufp) {
  64. fprintf(stderr, "Error: expected HTTP headers or URL line, got neither\n");
  65. return 1;
  66. }
  67. m1 = m0;
  68. // Skip over length field, which we don't need.
  69. --m0;
  70. while (m0 > bufp && '0' <= *m0 && *m0 <= '9') --m0;
  71. if (*m0 != ' ') {
  72. fprintf(stderr, "Error: URL line has unexpected format\n");
  73. return 1;
  74. }
  75. // Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace.
  76. url = bufp;
  77. urllen = m0 - bufp;
  78. if (!memmem(url, urllen, "://", 3)) {
  79. fprintf(stderr, "Error: URL line contains no scheme\n");
  80. return 1;
  81. }
  82. m0 = url;
  83. while (m0 < bufp + urllen && *m0 != '\r' && *m0 != '\n' && *m0 != ' ' && *m0 != '\t') ++m0;
  84. if (m0 != bufp + urllen) {
  85. fprintf(stderr, "Error: URL contains CR, LF, or whitespace\n");
  86. return 1;
  87. }
  88. DEBUG_PRINTF("Found URL: ");
  89. for (int i = 0; i < (urllen > 64 ? 64 : urllen); ++i) DEBUG_PRINTF(isprint(*(url + i)) ? "%c" : "\\x%02x", *(url + i) & 0xFF);
  90. if (urllen > 64) DEBUG_PRINTF("<...>");
  91. DEBUG_PRINTF("\n");
  92. // Skip over URL line and continue processing below
  93. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 1 - bufp);
  94. n = n - (m1 + 1 - bufp);
  95. bufp = m1 + 1;
  96. }
  97. if (html_fake_base && !url) {
  98. fprintf(stderr, "Error: --html-fake-base requires URL lines\n");
  99. return 1;
  100. }
  101. if (n < 9) {
  102. fprintf(stderr, "Error: too little data before HTTP headers\n");
  103. return 1;
  104. }
  105. if (memcmp(bufp, "HTTP/1.1 ", 9) == 0) {
  106. // Got some headers; find transfer encoding, content length, and end of headers
  107. eoh = memmem(bufp, n, "\r\n\r\n", 4);
  108. if (!eoh) {
  109. fprintf(stderr, "Error: end of headers not found\n");
  110. return 1;
  111. }
  112. eoh += 4;
  113. DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp);
  114. m0 = memmem(bufp, n, "\r\nContent-Length:", 17);
  115. if (m0 && m0 < eoh) {
  116. DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  117. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  118. if (!m1) {
  119. fprintf(stderr, "Error: CRLF after Content-Length missing\n");
  120. return 1;
  121. }
  122. m0 += 17;
  123. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  124. if (!sscanf(m0, "%zu%n", &length, &nscan)) {
  125. fprintf(stderr, "Error: invalid Content-Length\n");
  126. return 1;
  127. }
  128. if (nscan > n - (m0 - bufp)) {
  129. fprintf(stderr, "Error: buffer overread\n");
  130. return 1;
  131. }
  132. m0 += nscan;
  133. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  134. if (m0 != m1) {
  135. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  136. return 1;
  137. }
  138. DEBUG_PRINTF("Content length: %zu\n", length);
  139. state = STATE_BODY;
  140. } else {
  141. m0 = memmem(bufp, n, "\r\nTransfer-Encoding:", 20);
  142. if (!m0 || m0 >= eoh) {
  143. fprintf(stderr, "Error: Content-Length and Transfer-Encoding missing\n");
  144. return 1;
  145. }
  146. DEBUG_PRINTF("Found Transfer-Encoding header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  147. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  148. if (!m1 || m1 >= eoh - 2) {
  149. fprintf(stderr, "Error: CRLF after Transfer-Encoding missing\n");
  150. return 1;
  151. }
  152. m0 += 20;
  153. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  154. if (memcmp(m0, "chunked", 7) != 0) {
  155. fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
  156. return 1;
  157. }
  158. m0 += 7;
  159. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  160. if (m0 != m1) {
  161. fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
  162. return 1;
  163. }
  164. DEBUG_PRINTF("Chunked transfer encoding\n");
  165. state = STATE_CHUNK_LINE;
  166. }
  167. if (html_fake_base) {
  168. m0 = memmem(bufp, n, "\r\nContent-Type:", 15);
  169. if (m0 && m0 < eoh) {
  170. DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  171. m1 = memmem(m0 + 1, n - (m0 + 1 - bufp), "\r\n", 2);
  172. if (!m1) {
  173. fprintf(stderr, "Error: CRLF after Content-Type missing\n");
  174. return 1;
  175. }
  176. m0 += 15;
  177. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  178. if (memcmp(m0, "text/html", 9) == 0) {
  179. DEBUG_PRINTF("Is HTML response, inserting fake base tag\n");
  180. fprintf(stdout, "<base href=\"");
  181. fwrite(url, 1, urllen, stdout);
  182. fprintf(stdout, "\">\n");
  183. }
  184. }
  185. }
  186. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", eoh - bufp);
  187. n = n - (eoh - bufp);
  188. bufp = eoh;
  189. bytes_read = 0;
  190. goto checkstate;
  191. } else {
  192. fprintf(stderr, "Error: expected header line, got something else\n");
  193. return 1;
  194. }
  195. } else if (state == STATE_BODY || state == STATE_CHUNK_CONTENTS) {
  196. if (length + (state == STATE_BODY ? 1 : 2) - bytes_read > n) {
  197. // Only got part of the body
  198. DEBUG_PRINTF("Partial body\n");
  199. DEBUG_PRINTF("Copying %zu bytes to stdout\n", n);
  200. fwrite(bufp, 1, n, stdout);
  201. bytes_read += n;
  202. DEBUG_PRINTF("%zu of %zu bytes from this response written\n", bytes_read, length);
  203. } else {
  204. // Remainder of the response is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
  205. DEBUG_PRINTF("Full body\n");
  206. DEBUG_PRINTF("Copying %zu bytes to stdout\n", length - bytes_read);
  207. fwrite(bufp, 1, length - bytes_read, stdout);
  208. fprintf(stdout, "\n");
  209. if (state == STATE_CHUNK_CONTENTS && *(bufp + length - bytes_read) == '\r') {
  210. // Stupid hack to enforce the CRLF
  211. ++length;
  212. }
  213. if (memcmp(bufp + length - bytes_read, "\n", 1) != 0) {
  214. fprintf(stderr, "Error: end of HTTP body not found\n");
  215. return 1;
  216. }
  217. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", length + 1 - bytes_read);
  218. n = n - (length + 1 - bytes_read);
  219. bufp = bufp + length + 1 - bytes_read;
  220. if (n < BUFSIZE) {
  221. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  222. memmove(buf, bufp, n);
  223. bufp = buf;
  224. n += fread(buf + n, 1, BUFSIZE, stdin);
  225. }
  226. if (state == STATE_BODY) {
  227. state = STATE_HEADERS;
  228. } else {
  229. state = STATE_CHUNK_LINE;
  230. }
  231. goto checkstate;
  232. }
  233. } else if (state == STATE_CHUNK_LINE) {
  234. m1 = memmem(bufp, n, "\r\n", 2);
  235. if (!m1) {
  236. fprintf(stderr, "Error: chunk line EOL missing\n");
  237. return 1;
  238. }
  239. m0 = bufp;
  240. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  241. if (!sscanf(m0, "%x%n", &length, &nscan)) {
  242. fprintf(stderr, "Error: invalid chunk length\n");
  243. return 1;
  244. }
  245. if (nscan > n - (m0 - bufp)) {
  246. fprintf(stderr, "Error: buffer overread\n");
  247. return 1;
  248. }
  249. m0 += nscan;
  250. while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0;
  251. if (*m0 != ';' && m0 != m1) {
  252. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  253. return 1;
  254. }
  255. DEBUG_PRINTF("Chunk length: %zu bytes\n", length);
  256. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 2 - bufp);
  257. n = n - (m1 + 2 - bufp);
  258. bufp = m1 + 2;
  259. if (length == 0) {
  260. // End of response, must be followed by CRLF + LF
  261. if (n < 3) {
  262. fprintf(stderr, "Error: buffer exhausted while looking for empty chunk CRLF\n");
  263. return 1;
  264. }
  265. if (*(m1 + 2) != '\r' || *(m1 + 3) != '\n' || *(m1 + 4) != '\n') {
  266. fprintf(stderr, "Error: end of HTTP body not found\n");
  267. return 1;
  268. }
  269. n -= 3;
  270. bufp += 3;
  271. state = STATE_HEADERS;
  272. } else {
  273. state = STATE_CHUNK_CONTENTS;
  274. }
  275. if (n < BUFSIZE) {
  276. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  277. memmove(buf, bufp, n);
  278. bufp = buf;
  279. n += fread(buf + n, 1, BUFSIZE, stdin);
  280. }
  281. goto checkstate;
  282. }
  283. }
  284. if (state != STATE_HEADERS) {
  285. fprintf(stderr, "Error: incomplete body at the end of input\n");
  286. return 1;
  287. }
  288. }