The little things give you away... A collection of various small helper stuff
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

390 lignes
13 KiB

  1. #define _GNU_SOURCE
  2. #include <ctype.h>
  3. #include <stdbool.h>
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include <string.h>
  7. #ifndef BUFSIZE
  8. #define BUFSIZE 1048576
  9. #endif
  10. #define STATE_HEADERS 0
  11. #define STATE_BODY 1 // Body with a Content-Length header
  12. #define STATE_CHUNK_LINE 2
  13. #define STATE_CHUNK_CONTENTS 3
  14. #ifdef DEBUG
  15. #define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
  16. #else
  17. #define DEBUG_PRINTF(...) do {} while (false)
  18. #endif
  19. char* memcasemem(char* haystack, size_t haystacklen, char* needle, size_t needlelen) {
  20. // Case-insensitive (for ASCII) slower version of memmem
  21. // needle must already be in lower-case.
  22. if (needlelen > haystacklen) {
  23. // A needle longer than the haystack can never be in there.
  24. return NULL;
  25. }
  26. char* p1;
  27. char* p2;
  28. char* p3;
  29. for (p1 = haystack; p1 < haystack + haystacklen - needlelen; ++p1) {
  30. if (tolower((unsigned char)*p1) == *needle) {
  31. // Found a first char match, check the rest
  32. // No need to constrain p2; due to the needlelen>haystacklen check above and the limits on p1, p2 can never exceed the haystack.
  33. for (p2 = p1 + 1, p3 = needle + 1; p3 < needle + needlelen; p2++, p3++) {
  34. if (tolower((unsigned char)*p2) != *p3) {
  35. break;
  36. }
  37. }
  38. if (p3 == needle + needlelen) {
  39. // Full match
  40. return p1;
  41. }
  42. }
  43. }
  44. return NULL;
  45. }
  46. int main(int argc, char* argv[]) {
  47. // Read stdin, decode HTTP responses, dump all bodies to stdout.
  48. // stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses).
  49. // One LF is inserted at the end of each response to ensure that a new response always begins on a new line.
  50. // If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake <base> tag: <base href="URL">. The line is terminated with a LF.
  51. // Headers and chunk lines must fit into BUFSIZE.
  52. // Does not fully comply with the HTTP spec. For example, continuation lines are unsupported.
  53. char buf[2 * BUFSIZE];
  54. size_t n;
  55. int state = STATE_HEADERS;
  56. char* bufp;
  57. char* m0;
  58. char* m1;
  59. char* eoh;
  60. long int nscan;
  61. size_t bytes_read = 0;
  62. size_t length;
  63. bool html_fake_base = false;
  64. char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below.
  65. size_t urllen;
  66. bool have_response_length = false;
  67. size_t response_length;
  68. if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) {
  69. html_fake_base = true;
  70. }
  71. while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
  72. bufp = buf;
  73. checkstate:
  74. DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
  75. DEBUG_PRINTF("Beginning of buffer: ");
  76. for (unsigned int i = 0; i < (n > 64 ? 64 : n); ++i) DEBUG_PRINTF(isprint(*(bufp + i)) ? "%c" : "\\x%02x", *(bufp + i) & 0xFF);
  77. DEBUG_PRINTF("\n");
  78. if (n == 0) {
  79. break;
  80. }
  81. DEBUG_PRINTF("State: %d\n", state);
  82. if (state == STATE_HEADERS) {
  83. if (n < 9) {
  84. fprintf(stderr, "Error: too little data before HTTP headers\n");
  85. return 1;
  86. }
  87. // Handle URL + Length line; optional for HTTP/1.1 but required for HTTP/1.0
  88. url = NULL;
  89. urllen = 0;
  90. response_length = 0;
  91. if (memcmp(bufp, "HTTP/1.1 ", 9) != 0 && memcmp(bufp, "HTTP/1.0 ", 9) != 0) {
  92. DEBUG_PRINTF("No HTTP header, looking for URL line\n");
  93. m0 = memmem(bufp, n, "\n", 1);
  94. if (!m0 || m0 == bufp) {
  95. fprintf(stderr, "Error: expected HTTP headers or URL line, got neither\n");
  96. return 1;
  97. }
  98. m1 = m0;
  99. // Skip back over length field
  100. --m0;
  101. while (m0 > bufp && '0' <= *m0 && *m0 <= '9') --m0;
  102. if (*m0 != ' ') {
  103. fprintf(stderr, "Error: URL line has unexpected format\n");
  104. return 1;
  105. }
  106. // Read length
  107. if (sscanf(m0, " %zu%ln", &response_length, &nscan) <= 0) {
  108. fprintf(stderr, "Error: URL line contains no length\n");
  109. return 1;
  110. }
  111. if (nscan != m1 - m0) {
  112. fprintf(stderr, "Error: URL line length read mismatch\n");
  113. return 1;
  114. }
  115. have_response_length = true;
  116. DEBUG_PRINTF("Response length: %zu\n", response_length);
  117. // Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace.
  118. url = bufp;
  119. urllen = m0 - bufp;
  120. if (!memmem(url, urllen, "://", 3)) {
  121. fprintf(stderr, "Error: URL line contains no scheme\n");
  122. return 1;
  123. }
  124. m0 = url;
  125. while (m0 < bufp + urllen && *m0 != '\r' && *m0 != '\n' && *m0 != ' ' && *m0 != '\t') ++m0;
  126. if (m0 != bufp + urllen) {
  127. fprintf(stderr, "Error: URL contains CR, LF, or whitespace\n");
  128. return 1;
  129. }
  130. DEBUG_PRINTF("Found URL: ");
  131. for (unsigned int i = 0; i < (urllen > 64 ? 64 : urllen); ++i) DEBUG_PRINTF(isprint(*(url + i)) ? "%c" : "\\x%02x", *(url + i) & 0xFF);
  132. if (urllen > 64) DEBUG_PRINTF("<...>");
  133. DEBUG_PRINTF("\n");
  134. // Skip over URL line and continue processing below
  135. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 1 - bufp);
  136. n = n - (m1 + 1 - bufp);
  137. bufp = m1 + 1;
  138. }
  139. if (html_fake_base && !url) {
  140. fprintf(stderr, "Error: --html-fake-base requires URL lines\n");
  141. return 1;
  142. }
  143. if (memcmp(bufp, "HTTP/1.0 ", 9) == 0 && !have_response_length) {
  144. fprintf(stderr, "Error: HTTP/1.0 requires URL metadata lines\n");
  145. return 1;
  146. }
  147. if (n < 9) {
  148. fprintf(stderr, "Error: too little data before HTTP headers\n");
  149. return 1;
  150. }
  151. if (memcmp(bufp, "HTTP/1.1 ", 9) == 0 || memcmp(bufp, "HTTP/1.0 ", 9) == 0) {
  152. // Got some headers; find transfer encoding, content length, and end of headers
  153. m0 = memmem(bufp, n, "\r\n\r\n", 4);
  154. m1 = memmem(bufp, n, "\n\n", 2);
  155. if (m0 && m1) {
  156. eoh = (m0 < m1 ? m0 + 4 : m1 + 2);
  157. } else if (m0) {
  158. eoh = m0 + 4;
  159. } else if (m1) {
  160. eoh = m1 + 2;
  161. } else {
  162. eoh = NULL;
  163. }
  164. if (!eoh) {
  165. fprintf(stderr, "Error: end of headers not found\n");
  166. return 1;
  167. }
  168. DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp);
  169. if (memcmp(bufp, "HTTP/1.0 ", 9) == 0) {
  170. // HTTP 1.0 doesn't have TE, so just use the response_length for the content length and skip the other header parsing
  171. if (bufp + response_length < eoh) {
  172. fprintf(stderr, "Error: end of headers occurs after alleged response length\n");
  173. return 1;
  174. }
  175. length = response_length - (eoh - bufp);
  176. state = STATE_BODY;
  177. } else if ((m0 = memcasemem(bufp, eoh - bufp, "\ncontent-length:", 16)) && m0 < eoh) {
  178. DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  179. m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
  180. if (!m1) {
  181. fprintf(stderr, "Error: CRLF after Content-Length missing\n");
  182. return 1;
  183. }
  184. if (*(m1 - 1) == '\r') --m1;
  185. m0 += 16;
  186. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  187. if (sscanf(m0, "%zu%ln", &length, &nscan) <= 0) {
  188. fprintf(stderr, "Error: invalid Content-Length\n");
  189. return 1;
  190. }
  191. if (nscan < 0) {
  192. fprintf(stderr, "Error: negative nscan\n");
  193. return 1;
  194. }
  195. if (m0 + nscan > bufp + n) {
  196. fprintf(stderr, "Error: buffer overread\n");
  197. return 1;
  198. }
  199. m0 += nscan;
  200. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  201. if (m0 != m1) {
  202. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  203. return 1;
  204. }
  205. DEBUG_PRINTF("Content length: %zu\n", length);
  206. state = STATE_BODY;
  207. } else {
  208. m0 = memcasemem(bufp, eoh - bufp, "\ntransfer-encoding:", 19);
  209. if (!m0 || m0 >= eoh) {
  210. DEBUG_PRINTF("No Content-Length or Transfer-Encoding, falling back to response length\n");
  211. if (!have_response_length) {
  212. fprintf(stderr, "Error: Content-Length and Transfer-Encoding missing and no response length from metadata line\n");
  213. return 1;
  214. }
  215. if (bufp + response_length < eoh) {
  216. fprintf(stderr, "Error: end of headers occurs after alleged response length\n");
  217. return 1;
  218. }
  219. length = response_length - (eoh - bufp);
  220. state = STATE_BODY;
  221. } else {
  222. DEBUG_PRINTF("Found Transfer-Encoding header at %p (offset %zu)\n", (void*)(m0 + 1), m0 + 1 - bufp);
  223. m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
  224. if (!m1 || m1 >= eoh - 1) {
  225. fprintf(stderr, "Error: CRLF after Transfer-Encoding missing\n");
  226. return 1;
  227. }
  228. m0 += 19;
  229. if (*(m1 - 1) == '\r') --m1;
  230. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  231. if (memcmp(m0, "chunked", 7) != 0) {
  232. fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
  233. return 1;
  234. }
  235. m0 += 7;
  236. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  237. if (m0 != m1) {
  238. fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
  239. return 1;
  240. }
  241. DEBUG_PRINTF("Chunked transfer encoding\n");
  242. state = STATE_CHUNK_LINE;
  243. }
  244. }
  245. if (html_fake_base) {
  246. m0 = memcasemem(bufp, eoh - bufp, "\ncontent-type:", 14);
  247. if (m0 && m0 < eoh) {
  248. DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 1), m0 + 1 - bufp);
  249. m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
  250. if (!m1) {
  251. fprintf(stderr, "Error: CRLF after Content-Type missing\n");
  252. return 1;
  253. }
  254. m0 += 14;
  255. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  256. if (memcmp(m0, "text/html", 9) == 0) {
  257. DEBUG_PRINTF("Is HTML response, inserting fake base tag\n");
  258. fprintf(stdout, "<base href=\"");
  259. fwrite(url, 1, urllen, stdout);
  260. fprintf(stdout, "\">\n");
  261. }
  262. }
  263. }
  264. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", eoh - bufp);
  265. n = n - (eoh - bufp);
  266. bufp = eoh;
  267. bytes_read = 0;
  268. goto checkstate;
  269. } else {
  270. fprintf(stderr, "Error: expected header line, got something else\n");
  271. return 1;
  272. }
  273. } else if (state == STATE_BODY || state == STATE_CHUNK_CONTENTS) {
  274. if (length + (state == STATE_BODY ? 1 : 2) - bytes_read > n) {
  275. // Only got part of the body
  276. DEBUG_PRINTF("Partial body\n");
  277. DEBUG_PRINTF("Copying %zu bytes to stdout\n", n);
  278. fwrite(bufp, 1, n, stdout);
  279. bytes_read += n;
  280. DEBUG_PRINTF("%zu of %zu bytes from this response written\n", bytes_read, length);
  281. } else {
  282. // Remainder of the response is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
  283. DEBUG_PRINTF("Full body\n");
  284. DEBUG_PRINTF("Copying %zu bytes to stdout\n", length - bytes_read);
  285. fwrite(bufp, 1, length - bytes_read, stdout);
  286. if (state == STATE_BODY) {
  287. fprintf(stdout, "\n");
  288. }
  289. if (state == STATE_CHUNK_CONTENTS && *(bufp + length - bytes_read) == '\r') {
  290. // Stupid hack to enforce the CRLF
  291. ++length;
  292. }
  293. if (memcmp(bufp + length - bytes_read, "\n", 1) != 0) {
  294. fprintf(stderr, "Error: end of HTTP body not found\n");
  295. return 1;
  296. }
  297. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", length + 1 - bytes_read);
  298. n = n - (length + 1 - bytes_read);
  299. bufp = bufp + length + 1 - bytes_read;
  300. if (n < BUFSIZE) {
  301. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  302. memmove(buf, bufp, n);
  303. bufp = buf;
  304. n += fread(buf + n, 1, BUFSIZE, stdin);
  305. }
  306. if (state == STATE_BODY) {
  307. state = STATE_HEADERS;
  308. } else {
  309. state = STATE_CHUNK_LINE;
  310. }
  311. goto checkstate;
  312. }
  313. } else if (state == STATE_CHUNK_LINE) {
  314. m1 = memmem(bufp, n, "\r\n", 2);
  315. if (!m1) {
  316. fprintf(stderr, "Error: chunk line EOL missing\n");
  317. return 1;
  318. }
  319. m0 = bufp;
  320. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  321. if (sscanf(m0, "%zx%ln", &length, &nscan) <= 0) {
  322. fprintf(stderr, "Error: invalid chunk length\n");
  323. return 1;
  324. }
  325. if (nscan < 0) {
  326. fprintf(stderr, "Error: negative nscan\n");
  327. return 1;
  328. }
  329. if (m0 + nscan > bufp + n) {
  330. fprintf(stderr, "Error: buffer overread\n");
  331. return 1;
  332. }
  333. m0 += nscan;
  334. while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0;
  335. if (*m0 != ';' && m0 != m1) {
  336. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  337. return 1;
  338. }
  339. DEBUG_PRINTF("Chunk length: %zu bytes\n", length);
  340. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 2 - bufp);
  341. n = n - (m1 + 2 - bufp);
  342. bufp = m1 + 2;
  343. if (length == 0) {
  344. // End of response, must be followed by CRLF + LF
  345. if (n < 3) {
  346. fprintf(stderr, "Error: buffer exhausted while looking for empty chunk CRLF\n");
  347. return 1;
  348. }
  349. if (*(m1 + 2) != '\r' || *(m1 + 3) != '\n' || *(m1 + 4) != '\n') {
  350. fprintf(stderr, "Error: end of HTTP body not found\n");
  351. return 1;
  352. }
  353. fprintf(stdout, "\n");
  354. n -= 3;
  355. bufp += 3;
  356. state = STATE_HEADERS;
  357. } else {
  358. state = STATE_CHUNK_CONTENTS;
  359. }
  360. if (n < BUFSIZE) {
  361. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  362. memmove(buf, bufp, n);
  363. bufp = buf;
  364. n += fread(buf + n, 1, BUFSIZE, stdin);
  365. }
  366. goto checkstate;
  367. }
  368. }
  369. if (state != STATE_HEADERS) {
  370. fprintf(stderr, "Error: incomplete body at the end of input\n");
  371. return 1;
  372. }
  373. }