The little things give you away... A collection of various small helper stuff
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

193 lines
9.6 KiB

  1. // stdin: YouTube URLs or data with little noise besides that
  2. // stdout: lines for videos, channels, playlists, and unknown YouTube IDs found in the input, prefixed with v, c, p, and ?, respectively
  3. #include <stdbool.h>
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include <string.h>
  // DEBUG may be predefined on the compiler command line (e.g. -DDEBUG=1); it defaults to off.
  #ifndef DEBUG
  #define DEBUG 0
  #endif
  // Print a debug message to stderr, prefixed with file, line and function name.
  // The fprintf call is always compiled but only executed when DEBUG is non-zero.
  // At least one argument must follow fmt, because __VA_ARGS__ is preceded by a comma.
  #define debug_print(fmt, ...) do { if (DEBUG) fprintf(stderr, "%s:%d:%s(): " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); } while (0)
  // Size of the input buffer in bytes. Parenthesized so that the expansion stays a single operand
  // in any surrounding expression (e.g. BUFFER_SIZE % x or x / BUFFER_SIZE).
  #define BUFFER_SIZE (1024 * 1024)
  #define MAX_RESULT_SIZE 1024
  // MAX_RESULT_SIZE is the maximum length of an individual match. This must be smaller than BUFFER_SIZE/2.
  // Scanner states:
  //   STATE_NONE             - not inside a run of ID characters
  //   STATE_ID               - inside a run of ID characters that may still become a match
  //   STATE_SKIP_UNTIL_NONID - inside a run that exceeded MAX_RESULT_SIZE; ignore it until it ends
  #define STATE_NONE 0
  #define STATE_ID 1
  #define STATE_SKIP_UNTIL_NONID 2
  // True if c is one of [0-9A-Za-z_-]. The argument is parenthesized at every use so that
  // expressions with low-precedence operators can be passed safely. Note that c is evaluated
  // several times, so it must not have side effects.
  #define IS_ID_CHAR(c) (('0' <= (c) && (c) <= '9') || ('A' <= (c) && (c) <= 'Z') || ('a' <= (c) && (c) <= 'z') || (c) == '_' || (c) == '-')
  // Return true if the first len bytes at c consist solely of the characters 0-9 and A-F
  // (upper-case hexadecimal digits). An empty range (len == 0) yields true.
  //
  // Declared static: a plain `inline` definition without `static` or `extern` does not provide
  // an external definition in C99 and later, so any call the compiler decides not to inline
  // (for example when building without optimization) would fail to link.
  static inline bool is_upper_hex(const char *c, size_t len) {
      for (size_t i = 0; i < len; i++) {
          bool is_digit = '0' <= c[i] && c[i] <= '9';
          bool is_upper_a_to_f = 'A' <= c[i] && c[i] <= 'F';
          if (!is_digit && !is_upper_a_to_f) {
              return false;
          }
      }
      return true;
  }
  25. int main(int argc, char** argv) {
  26. char* inbuf = malloc(sizeof(char) * BUFFER_SIZE);
  27. int state = 0;
  28. size_t stateStart = 0;
  29. size_t inbufEnd = 0;
  30. while (1) {
  31. debug_print("inbufEnd = %d, state = %d, stateStart = %d\n", inbufEnd, state, stateStart);
  32. debug_print("Reading %d from stdin\n", BUFFER_SIZE - inbufEnd);
  33. size_t readSize = fread(inbuf + inbufEnd, sizeof(char), BUFFER_SIZE - inbufEnd, stdin);
  34. debug_print("Got %d bytes\n", readSize);
  35. if (readSize == 0) {
  36. if (inbufEnd == 0) {
  37. // Nothing read, nothing left from previous iteration. Bye.
  38. break;
  39. } else {
  40. // No more input data but still something left from the previous read.
  41. // Make sure that the next character cannot be considered valid in any state (NUL qualifies), then let the code below handle things.
  42. inbuf[inbufEnd] = '\0';
  43. readSize += 1;
  44. }
  45. }
  46. for (size_t p = inbufEnd; p < inbufEnd + readSize; p++) {
  47. debug_print("p = %d, character = %c, state = %d, stateStart = %d\n", p, inbuf[p], state, stateStart);
  48. if (state != STATE_NONE && state != STATE_SKIP_UNTIL_NONID && p - stateStart >= MAX_RESULT_SIZE) {
  49. debug_print("%s\n", "max result size exceeded, dropping result and switching to STATE_SKIP_UNTIL_NONID");
  50. state = STATE_SKIP_UNTIL_NONID;
  51. stateStart = 0;
  52. }
  53. switch (state) {
  54. case STATE_NONE:
  55. if (IS_ID_CHAR(inbuf[p])) {
  56. debug_print("%c is an ID char, switching to STATE_ID\n", inbuf[p]);
  57. state = STATE_ID;
  58. stateStart = p;
  59. }
  60. break;
  61. case STATE_ID:
  62. if (!IS_ID_CHAR(inbuf[p])) {
  63. debug_print("%c is not an ID char\n", inbuf[p]);
  64. if (p - stateStart >= 10) {
  65. debug_print("p = %d, stateStart = %d, got %d ID chars: %.*s\n", p, stateStart, p - stateStart, p - stateStart, inbuf + stateStart);
  66. if (p - stateStart == 11) {
  67. printf("v %.*s\n", p - stateStart, inbuf + stateStart);
  68. } else if (p - stateStart == 22) {
  69. printf("c %.*s\n", p - stateStart, inbuf + stateStart);
  70. } else if (p - stateStart == 24 && inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'C') {
  71. printf("c %.*s\n", p - stateStart, inbuf + stateStart);
  72. } else {
  73. // Playlist candidates, some of which contain IDs for channels or videos
  74. if (p - stateStart >= 19 && memcmp(inbuf + stateStart, "RDAMPL", 6) == 0) {
  75. // Playlist ID starts with RDAMPL, which is followed by a normal playlist ID, so skip that.
  76. stateStart += 6;
  77. }
  78. if (p - stateStart == 32 || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'L' && p - stateStart == 34)) {
  79. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  80. } else if ( (p - stateStart == 16 || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'L' && p - stateStart == 18))
  81. && is_upper_hex(inbuf + p - 16, 16)
  82. ) {
  83. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  84. } else if (p - stateStart == 24 && ( (inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'U')
  85. || (inbuf[stateStart] == 'L' && inbuf[stateStart + 1] == 'L')
  86. || (inbuf[stateStart] == 'F' && inbuf[stateStart + 1] == 'L')
  87. || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'U')
  88. )
  89. ) {
  90. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  91. printf("c %.*s\n", 22, inbuf + stateStart + 2);
  92. } else if (p - stateStart == 26 && memcmp(inbuf + stateStart, "UUSH", 4) == 0) {
  93. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  94. printf("c %.*s\n", 22, inbuf + stateStart + 4);
  95. } else if (p - stateStart == 13 && ( (inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D')
  96. || (inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'L')
  97. || (inbuf[stateStart] == 'E' && inbuf[stateStart + 1] == 'L')
  98. || (inbuf[stateStart] == 'C' && inbuf[stateStart + 1] == 'L')
  99. || (inbuf[stateStart] == 'S' && inbuf[stateStart + 1] == 'L')
  100. || (inbuf[stateStart] == 'L' && inbuf[stateStart + 1] == 'P')
  101. )
  102. ) {
  103. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  104. printf("v %.*s\n", 11, inbuf + stateStart + 2);
  105. } else if ( p - stateStart == 15 && inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D'
  106. && ( (inbuf[stateStart + 2] == 'M' && inbuf[stateStart + 3] == 'M')
  107. || (inbuf[stateStart + 2] == 'Q' && inbuf[stateStart + 3] == 'M')
  108. || (inbuf[stateStart + 2] == 'E' && inbuf[stateStart + 3] == 'M')
  109. || (inbuf[stateStart + 2] == 'L' && inbuf[stateStart + 3] == 'V')
  110. || (inbuf[stateStart + 2] == 'H' && inbuf[stateStart + 3] == 'C')
  111. )
  112. ) {
  113. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  114. printf("v %.*s\n", 11, inbuf + stateStart + 4);
  115. } else if ( p - stateStart == 15 && inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D'
  116. && '0' <= inbuf[stateStart + 2] && inbuf[stateStart + 2] <= '4'
  117. && '0' <= inbuf[stateStart + 3] && inbuf[stateStart + 3] <= '9'
  118. ) {
  119. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  120. printf("v %.*s\n", 11, inbuf + stateStart + 4);
  121. } else if ( p - stateStart == 28 && ( memcmp(inbuf + stateStart, "RDAMVM", 6) == 0
  122. || memcmp(inbuf + stateStart, "RDGMEM", 6) == 0
  123. )
  124. ) {
  125. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  126. } else if (p - stateStart == 28 && memcmp(inbuf + stateStart, "RDCMUC", 6) == 0) {
  127. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  128. printf("c %.*s\n", 24, inbuf + stateStart + 4);
  129. } else if ( p - stateStart == 41 && memcmp(inbuf + stateStart, "RDGMEM", 6) == 0
  130. && inbuf[stateStart + 28] == 'V' && inbuf[stateStart + 29] == 'M'
  131. ) {
  132. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  133. printf("v %.*s\n", 11, inbuf + stateStart + 30);
  134. } else if (p - stateStart == 26 && ( memcmp(inbuf + stateStart, "RDAO", 4) == 0
  135. || memcmp(inbuf + stateStart, "RDEM", 4) == 0
  136. || memcmp(inbuf + stateStart, "RDKM", 4) == 0
  137. )
  138. ) {
  139. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  140. } else if ( p - stateStart == 43 && 'k' <= inbuf[stateStart + 10] && inbuf[stateStart + 10] <= 'n'
  141. && ( memcmp(inbuf + stateStart, "RDCLAK5uy_", 10) == 0
  142. || memcmp(inbuf + stateStart, "RDTMAK5uy_", 10) == 0
  143. )
  144. ) {
  145. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  146. } else if ( p - stateStart == 41 && 'k' <= inbuf[stateStart + 8] && inbuf[stateStart + 8] <= 'n'
  147. && memcmp(inbuf + stateStart, "OLAK5uy_", 8) == 0
  148. ) {
  149. printf("p %.*s\n", p - stateStart, inbuf + stateStart);
  150. } else {
  151. printf("? %.*s\n", p - stateStart, inbuf + stateStart);
  152. }
  153. }
  154. }
  155. debug_print("%s\n", "Switching to STATE_NONE");
  156. state = STATE_NONE;
  157. stateStart = 0;
  158. }
  159. break;
  160. case STATE_SKIP_UNTIL_NONID:
  161. if (!IS_ID_CHAR(inbuf[p])) {
  162. debug_print("%s\n", "Switching to STATE_NONE");
  163. state = STATE_NONE;
  164. }
  165. break;
  166. }
  167. }
  168. if (state != STATE_NONE && state != STATE_SKIP_UNTIL_NONID) {
  169. // Need to keep the trailing part of the buffer for the next iteration.
  170. // Because stateStart gets reset to zero when it exceeds MAX_RESULT_SIZE, inbufEnd + readSize - stateStart is guaranteed to be smaller than MAX_RESULT_SIZE.
  171. // Because MAX_RESULT_SIZE < BUFFER_SIZE/2, we can simply copy the last bytes to the beginning of inbuf directly.
  172. debug_print("Copying %d bytes starting from %d to the beginning of the buffer: %.*s\n",
  173. inbufEnd + readSize - stateStart, stateStart, inbufEnd + readSize - stateStart, inbuf + stateStart);
  174. memcpy(inbuf, inbuf + stateStart, inbufEnd + readSize - stateStart);
  175. inbufEnd += readSize - stateStart;
  176. stateStart = 0;
  177. } else {
  178. debug_print("%s\n", "No buffer copying necessary");
  179. inbufEnd = 0;
  180. }
  181. }
  182. }