The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

390 lines
13 KiB

  1. #define _GNU_SOURCE
  2. #include <ctype.h>
  3. #include <stdbool.h>
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include <string.h>
  7. #ifndef BUFSIZE
  8. #define BUFSIZE 1048576
  9. #endif
  10. #define STATE_HEADERS 0
  11. #define STATE_BODY 1 // Body with a Content-Length header
  12. #define STATE_CHUNK_LINE 2
  13. #define STATE_CHUNK_CONTENTS 3
  14. #ifdef DEBUG
  15. #define DEBUG_PRINTF(...) do { fprintf(stderr, __VA_ARGS__); } while (false)
  16. #else
  17. #define DEBUG_PRINTF(...) do {} while (false)
  18. #endif
  19. char* memcasemem(char* haystack, size_t haystacklen, char* needle, size_t needlelen) {
  20. // Case-insensitive (for ASCII) slower version of memmem
  21. // needle must already be in lower-case.
  22. if (needlelen > haystacklen) {
  23. // A needle longer than the haystack can never be in there.
  24. return NULL;
  25. }
  26. char* p1;
  27. char* p2;
  28. char* p3;
  29. for (p1 = haystack; p1 < haystack + haystacklen - needlelen; ++p1) {
  30. if (tolower((unsigned char)*p1) == *needle) {
  31. // Found a first char match, check the rest
  32. // No need to constrain p2; due to the needlelen>haystacklen check above and the limits on p1, p2 can never exceed the haystack.
  33. for (p2 = p1 + 1, p3 = needle + 1; p3 < needle + needlelen; p2++, p3++) {
  34. if (tolower((unsigned char)*p2) != *p3) {
  35. break;
  36. }
  37. }
  38. if (p3 == needle + needlelen) {
  39. // Full match
  40. return p1;
  41. }
  42. }
  43. }
  44. return NULL;
  45. }
  46. int main(int argc, char* argv[]) {
  47. // Read stdin, decode HTTP responses, dump all bodies to stdout.
  48. // stdin may contain an extra 'URL LENGTH\n' line before each response (--meta output from warc-dump-responses).
  49. // One LF is inserted at the end of each response to ensure that a new response always begins on a new line.
  50. // If --html-fake-base is provided and the input contains URL data, every HTML response (Content-Type: text/html header) is prefixed with one line containing a fake <base> tag: <base href="URL">. The line is terminated with a LF.
  51. // Headers and chunk lines must fit into BUFSIZE.
  52. // Does not fully comply with the HTTP spec. For example, continuation lines are unsupported.
  53. char buf[2 * BUFSIZE];
  54. size_t n;
  55. int state = STATE_HEADERS;
  56. char* bufp;
  57. char* m0;
  58. char* m1;
  59. char* eoh;
  60. long int nscan;
  61. size_t bytes_read = 0;
  62. size_t length;
  63. bool html_fake_base = false;
  64. char* url = NULL; // Warning, pointer is only valid within the STATE_HEADERS block below.
  65. size_t urllen;
  66. bool have_response_length = false;
  67. size_t response_length;
  68. if (argc == 2 && strcmp(argv[1], "--html-fake-base") == 0) {
  69. html_fake_base = true;
  70. }
  71. while ((n = fread(buf, 1, BUFSIZE, stdin)) > 0) {
  72. bufp = buf;
  73. checkstate:
  74. DEBUG_PRINTF("Have %zu bytes of buffer (at %p)\n", n, (void*)bufp);
  75. DEBUG_PRINTF("Beginning of buffer: ");
  76. for (unsigned int i = 0; i < (n > 64 ? 64 : n); ++i) DEBUG_PRINTF(isprint(*(bufp + i)) ? "%c" : "\\x%02x", *(bufp + i) & 0xFF);
  77. DEBUG_PRINTF("\n");
  78. if (n == 0) {
  79. break;
  80. }
  81. DEBUG_PRINTF("State: %d\n", state);
  82. if (state == STATE_HEADERS) {
  83. if (n < 9) {
  84. fprintf(stderr, "Error: too little data before HTTP headers\n");
  85. return 1;
  86. }
  87. // Handle URL + Length line; optional for HTTP/1.1 but required for HTTP/1.0
  88. url = NULL;
  89. urllen = 0;
  90. response_length = 0;
  91. if (memcmp(bufp, "HTTP/1.1 ", 9) != 0 && memcmp(bufp, "HTTP/1.0 ", 9) != 0) {
  92. DEBUG_PRINTF("No HTTP header, looking for URL line\n");
  93. m0 = memmem(bufp, n, "\n", 1);
  94. if (!m0 || m0 == bufp) {
  95. fprintf(stderr, "Error: expected HTTP headers or URL line, got neither\n");
  96. return 1;
  97. }
  98. m1 = m0;
  99. // Skip back over length field
  100. --m0;
  101. while (m0 > bufp && '0' <= *m0 && *m0 <= '9') --m0;
  102. if (*m0 != ' ') {
  103. fprintf(stderr, "Error: URL line has unexpected format\n");
  104. return 1;
  105. }
  106. // Read length
  107. if (sscanf(m0, " %zu%ln", &response_length, &nscan) <= 0) {
  108. fprintf(stderr, "Error: URL line contains no length\n");
  109. return 1;
  110. }
  111. if (nscan != m1 - m0) {
  112. fprintf(stderr, "Error: URL line length read mismatch\n");
  113. return 1;
  114. }
  115. have_response_length = true;
  116. DEBUG_PRINTF("Response length: %zu\n", response_length);
  117. // Rest must now be the URL; check that there is a scheme and no CR, LF, or whitespace.
  118. url = bufp;
  119. urllen = m0 - bufp;
  120. if (!memmem(url, urllen, "://", 3)) {
  121. fprintf(stderr, "Error: URL line contains no scheme\n");
  122. return 1;
  123. }
  124. m0 = url;
  125. while (m0 < bufp + urllen && *m0 != '\r' && *m0 != '\n' && *m0 != ' ' && *m0 != '\t') ++m0;
  126. if (m0 != bufp + urllen) {
  127. fprintf(stderr, "Error: URL contains CR, LF, or whitespace\n");
  128. return 1;
  129. }
  130. DEBUG_PRINTF("Found URL: ");
  131. for (unsigned int i = 0; i < (urllen > 64 ? 64 : urllen); ++i) DEBUG_PRINTF(isprint(*(url + i)) ? "%c" : "\\x%02x", *(url + i) & 0xFF);
  132. if (urllen > 64) DEBUG_PRINTF("<...>");
  133. DEBUG_PRINTF("\n");
  134. // Skip over URL line and continue processing below
  135. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 1 - bufp);
  136. n = n - (m1 + 1 - bufp);
  137. bufp = m1 + 1;
  138. }
  139. if (html_fake_base && !url) {
  140. fprintf(stderr, "Error: --html-fake-base requires URL lines\n");
  141. return 1;
  142. }
  143. if (memcmp(bufp, "HTTP/1.0 ", 9) == 0 && !have_response_length) {
  144. fprintf(stderr, "Error: HTTP/1.0 requires URL metadata lines\n");
  145. return 1;
  146. }
  147. if (n < 9) {
  148. fprintf(stderr, "Error: too little data before HTTP headers\n");
  149. return 1;
  150. }
  151. if (memcmp(bufp, "HTTP/1.1 ", 9) == 0 || memcmp(bufp, "HTTP/1.0 ", 9) == 0) {
  152. // Got some headers; find transfer encoding, content length, and end of headers
  153. m0 = memmem(bufp, n, "\r\n\r\n", 4);
  154. m1 = memmem(bufp, n, "\n\n", 2);
  155. if (m0 && m1) {
  156. eoh = (m0 < m1 ? m0 + 4 : m1 + 2);
  157. } else if (m0) {
  158. eoh = m0 + 4;
  159. } else if (m1) {
  160. eoh = m1 + 2;
  161. } else {
  162. eoh = NULL;
  163. }
  164. if (!eoh) {
  165. fprintf(stderr, "Error: end of headers not found\n");
  166. return 1;
  167. }
  168. DEBUG_PRINTF("Response body begins at %p (offset %zu)\n", (void*)eoh, eoh - bufp);
  169. if (memcmp(bufp, "HTTP/1.0 ", 9) == 0) {
  170. // HTTP 1.0 doesn't have TE, so just use the response_length for the content length and skip the other header parsing
  171. if (bufp + response_length < eoh) {
  172. fprintf(stderr, "Error: end of headers occurs after alleged response length\n");
  173. return 1;
  174. }
  175. length = response_length - (eoh - bufp);
  176. state = STATE_BODY;
  177. } else if ((m0 = memcasemem(bufp, eoh - bufp, "\ncontent-length:", 16)) && m0 < eoh) {
  178. DEBUG_PRINTF("Found Content-Length header at %p (offset %zu)\n", (void*)(m0 + 2), m0 + 2 - bufp);
  179. m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
  180. if (!m1) {
  181. fprintf(stderr, "Error: CRLF after Content-Length missing\n");
  182. return 1;
  183. }
  184. if (*(m1 - 1) == '\r') --m1;
  185. m0 += 16;
  186. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  187. if (sscanf(m0, "%zu%ln", &length, &nscan) <= 0) {
  188. fprintf(stderr, "Error: invalid Content-Length\n");
  189. return 1;
  190. }
  191. if (nscan < 0) {
  192. fprintf(stderr, "Error: negative nscan\n");
  193. return 1;
  194. }
  195. if (m0 + nscan > bufp + n) {
  196. fprintf(stderr, "Error: buffer overread\n");
  197. return 1;
  198. }
  199. m0 += nscan;
  200. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  201. if (m0 != m1) {
  202. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  203. return 1;
  204. }
  205. DEBUG_PRINTF("Content length: %zu\n", length);
  206. state = STATE_BODY;
  207. } else {
  208. m0 = memcasemem(bufp, eoh - bufp, "\ntransfer-encoding:", 19);
  209. if (!m0 || m0 >= eoh) {
  210. DEBUG_PRINTF("No Content-Length or Transfer-Encoding, falling back to response length\n");
  211. if (!have_response_length) {
  212. fprintf(stderr, "Error: Content-Length and Transfer-Encoding missing and no response length from metadata line\n");
  213. return 1;
  214. }
  215. if (bufp + response_length < eoh) {
  216. fprintf(stderr, "Error: end of headers occurs after alleged response length\n");
  217. return 1;
  218. }
  219. length = response_length - (eoh - bufp);
  220. state = STATE_BODY;
  221. } else {
  222. DEBUG_PRINTF("Found Transfer-Encoding header at %p (offset %zu)\n", (void*)(m0 + 1), m0 + 1 - bufp);
  223. m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
  224. if (!m1 || m1 >= eoh - 1) {
  225. fprintf(stderr, "Error: CRLF after Transfer-Encoding missing\n");
  226. return 1;
  227. }
  228. m0 += 19;
  229. if (*(m1 - 1) == '\r') --m1;
  230. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  231. if (memcmp(m0, "chunked", 7) != 0) {
  232. fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
  233. return 1;
  234. }
  235. m0 += 7;
  236. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  237. if (m0 != m1) {
  238. fprintf(stderr, "Error: unsupported Transfer-Encoding\n");
  239. return 1;
  240. }
  241. DEBUG_PRINTF("Chunked transfer encoding\n");
  242. state = STATE_CHUNK_LINE;
  243. }
  244. }
  245. if (html_fake_base) {
  246. m0 = memcasemem(bufp, eoh - bufp, "\ncontent-type:", 14);
  247. if (m0 && m0 < eoh) {
  248. DEBUG_PRINTF("Found Content-Type header at %p (offset %zu)\n", (void*)(m0 + 1), m0 + 1 - bufp);
  249. m1 = memmem(m0 + 1, eoh - (m0 + 1), "\n", 1);
  250. if (!m1) {
  251. fprintf(stderr, "Error: CRLF after Content-Type missing\n");
  252. return 1;
  253. }
  254. m0 += 14;
  255. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  256. if (memcmp(m0, "text/html", 9) == 0) {
  257. DEBUG_PRINTF("Is HTML response, inserting fake base tag\n");
  258. fprintf(stdout, "<base href=\"");
  259. fwrite(url, 1, urllen, stdout);
  260. fprintf(stdout, "\">\n");
  261. }
  262. }
  263. }
  264. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", eoh - bufp);
  265. n = n - (eoh - bufp);
  266. bufp = eoh;
  267. bytes_read = 0;
  268. goto checkstate;
  269. } else {
  270. fprintf(stderr, "Error: expected header line, got something else\n");
  271. return 1;
  272. }
  273. } else if (state == STATE_BODY || state == STATE_CHUNK_CONTENTS) {
  274. if (length + (state == STATE_BODY ? 1 : 2) - bytes_read > n) {
  275. // Only got part of the body
  276. DEBUG_PRINTF("Partial body\n");
  277. DEBUG_PRINTF("Copying %zu bytes to stdout\n", n);
  278. fwrite(bufp, 1, n, stdout);
  279. bytes_read += n;
  280. DEBUG_PRINTF("%zu of %zu bytes from this response written\n", bytes_read, length);
  281. } else {
  282. // Remainder of the response is in the buffer. Same logic as above for small records fitting in the buffer with the headers.
  283. DEBUG_PRINTF("Full body\n");
  284. DEBUG_PRINTF("Copying %zu bytes to stdout\n", length - bytes_read);
  285. fwrite(bufp, 1, length - bytes_read, stdout);
  286. if (state == STATE_BODY) {
  287. fprintf(stdout, "\n");
  288. }
  289. if (state == STATE_CHUNK_CONTENTS && *(bufp + length - bytes_read) == '\r') {
  290. // Stupid hack to enforce the CRLF
  291. ++length;
  292. }
  293. if (memcmp(bufp + length - bytes_read, "\n", 1) != 0) {
  294. fprintf(stderr, "Error: end of HTTP body not found\n");
  295. return 1;
  296. }
  297. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", length + 1 - bytes_read);
  298. n = n - (length + 1 - bytes_read);
  299. bufp = bufp + length + 1 - bytes_read;
  300. if (n < BUFSIZE) {
  301. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  302. memmove(buf, bufp, n);
  303. bufp = buf;
  304. n += fread(buf + n, 1, BUFSIZE, stdin);
  305. }
  306. if (state == STATE_BODY) {
  307. state = STATE_HEADERS;
  308. } else {
  309. state = STATE_CHUNK_LINE;
  310. }
  311. goto checkstate;
  312. }
  313. } else if (state == STATE_CHUNK_LINE) {
  314. m1 = memmem(bufp, n, "\r\n", 2);
  315. if (!m1) {
  316. fprintf(stderr, "Error: chunk line EOL missing\n");
  317. return 1;
  318. }
  319. m0 = bufp;
  320. while (m0 < bufp + n && (*m0 == ' ' || *m0 == '\t')) ++m0;
  321. if (sscanf(m0, "%zx%ln", &length, &nscan) <= 0) {
  322. fprintf(stderr, "Error: invalid chunk length\n");
  323. return 1;
  324. }
  325. if (nscan < 0) {
  326. fprintf(stderr, "Error: negative nscan\n");
  327. return 1;
  328. }
  329. if (m0 + nscan > bufp + n) {
  330. fprintf(stderr, "Error: buffer overread\n");
  331. return 1;
  332. }
  333. m0 += nscan;
  334. while (m0 < m1 && (*m0 == ' ' || *m0 == '\t')) ++m0;
  335. if (*m0 != ';' && m0 != m1) {
  336. fprintf(stderr, "Error: invalid Content-Length (noise before EOL)\n");
  337. return 1;
  338. }
  339. DEBUG_PRINTF("Chunk length: %zu bytes\n", length);
  340. DEBUG_PRINTF("Adjusting buffer pointer and n by %zu\n", m1 + 2 - bufp);
  341. n = n - (m1 + 2 - bufp);
  342. bufp = m1 + 2;
  343. if (length == 0) {
  344. // End of response, must be followed by CRLF + LF
  345. if (n < 3) {
  346. fprintf(stderr, "Error: buffer exhausted while looking for empty chunk CRLF\n");
  347. return 1;
  348. }
  349. if (*(m1 + 2) != '\r' || *(m1 + 3) != '\n' || *(m1 + 4) != '\n') {
  350. fprintf(stderr, "Error: end of HTTP body not found\n");
  351. return 1;
  352. }
  353. fprintf(stdout, "\n");
  354. n -= 3;
  355. bufp += 3;
  356. state = STATE_HEADERS;
  357. } else {
  358. state = STATE_CHUNK_CONTENTS;
  359. }
  360. if (n < BUFSIZE) {
  361. DEBUG_PRINTF("Buffer too small (%zu bytes), moving and refilling\n", n);
  362. memmove(buf, bufp, n);
  363. bufp = buf;
  364. n += fread(buf + n, 1, BUFSIZE, stdin);
  365. }
  366. goto checkstate;
  367. }
  368. }
  369. if (state != STATE_HEADERS) {
  370. fprintf(stderr, "Error: incomplete body at the end of input\n");
  371. return 1;
  372. }
  373. }