|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- // stdin: YouTube URLs or data with little noise besides that
- // stdout: lines for videos, channels, playlists, and unknown YouTube IDs found in the input, prefixed with v, c, p, and ?, respectively
-
- #include <stdbool.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
-
// Debug tracing is disabled unless DEBUG is defined to a non-zero value at compile time (e.g. -DDEBUG=1).
#ifndef DEBUG
#define DEBUG 0
#endif
// Writes a trace line to stderr, prefixed with the source file, line, and function name.
// At least one argument must follow the format string because __VA_ARGS__ may not be empty.
// The call is always compiled (so the format string stays checked) but only executed when DEBUG is non-zero.
#define debug_print(fmt, ...) do { if (DEBUG) fprintf(stderr, "%s:%d:%s(): " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); } while (0)

// Size of the input buffer in bytes. Parenthesised so that the macro behaves as a single value in any expression (e.g. x / BUFFER_SIZE).
#define BUFFER_SIZE (1024 * 1024)
#define MAX_RESULT_SIZE 1024
// MAX_RESULT_SIZE is the maximum length of an individual match. This must be smaller than BUFFER_SIZE/2.

// Scanner states
#define STATE_NONE 0              // not inside a run of ID characters
#define STATE_ID 1                // inside a run of ID characters that may become a result
#define STATE_SKIP_UNTIL_NONID 2  // inside a run that is too long to be a result; ignore it until it ends

// True if c is a character that can appear in an ID: ASCII letters, digits, '_' and '-'.
// The argument is evaluated several times, so it must not have side effects.
#define IS_ID_CHAR(c) (('0' <= (c) && (c) <= '9') || ('A' <= (c) && (c) <= 'Z') || ('a' <= (c) && (c) <= 'z') || (c) == '_' || (c) == '-')
-
// Returns true if the first len bytes at c consist only of the characters 0-9 and A-F
// (upper-case hexadecimal digits). An empty range (len == 0) yields true.
//
// Declared static: a plain `inline` definition provides no external definition in C99/C11,
// so a call that the compiler chooses not to inline would fail to link.
static inline bool is_upper_hex(const char *c, size_t len) {
    for (size_t i = 0; i < len; i++) {
        bool is_digit = '0' <= c[i] && c[i] <= '9';
        bool is_upper_af = 'A' <= c[i] && c[i] <= 'F';
        if (!is_digit && !is_upper_af) {
            return false;
        }
    }
    return true;
}
-
- int main(int argc, char** argv) {
- char* inbuf = malloc(sizeof(char) * BUFFER_SIZE);
- int state = 0;
- size_t stateStart = 0;
- size_t inbufEnd = 0;
-
- while (1) {
- debug_print("inbufEnd = %d, state = %d, stateStart = %d\n", inbufEnd, state, stateStart);
- debug_print("Reading %d from stdin\n", BUFFER_SIZE - inbufEnd);
- size_t readSize = fread(inbuf + inbufEnd, sizeof(char), BUFFER_SIZE - inbufEnd, stdin);
- debug_print("Got %d bytes\n", readSize);
- if (readSize == 0) {
- if (inbufEnd == 0) {
- // Nothing read, nothing left from previous iteration. Bye.
- break;
- } else {
- // No more input data but still something left from the previous read.
- // Make sure that the next character cannot be considered valid in any state (NUL qualifies), then let the code below handle things.
- inbuf[inbufEnd] = '\0';
- readSize += 1;
- }
- }
- for (size_t p = inbufEnd; p < inbufEnd + readSize; p++) {
- debug_print("p = %d, character = %c, state = %d, stateStart = %d\n", p, inbuf[p], state, stateStart);
- if (state != STATE_NONE && state != STATE_SKIP_UNTIL_NONID && p - stateStart >= MAX_RESULT_SIZE) {
- debug_print("%s\n", "max result size exceeded, dropping result and switching to STATE_SKIP_UNTIL_NONID");
- state = STATE_SKIP_UNTIL_NONID;
- stateStart = 0;
- }
- switch (state) {
- case STATE_NONE:
- if (IS_ID_CHAR(inbuf[p])) {
- debug_print("%c is an ID char, switching to STATE_ID\n", inbuf[p]);
- state = STATE_ID;
- stateStart = p;
- }
- break;
-
- case STATE_ID:
- if (!IS_ID_CHAR(inbuf[p])) {
- debug_print("%c is not an ID char\n", inbuf[p]);
- if (p - stateStart >= 10) {
- debug_print("p = %d, stateStart = %d, got %d ID chars: %.*s\n", p, stateStart, p - stateStart, p - stateStart, inbuf + stateStart);
- if (p - stateStart == 11) {
- printf("v %.*s\n", p - stateStart, inbuf + stateStart);
- } else if (p - stateStart == 22) {
- printf("c %.*s\n", p - stateStart, inbuf + stateStart);
- } else if (p - stateStart == 24 && inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'C') {
- printf("c %.*s\n", p - stateStart, inbuf + stateStart);
- } else {
- // Playlist candidates, some of which contain IDs for channels or videos
- if (p - stateStart >= 19 && memcmp(inbuf + stateStart, "RDAMPL", 6) == 0) {
- // Playlist ID starts with RDAMPL, which is followed by a normal playlist ID, so skip that.
- stateStart += 6;
- }
- if (p - stateStart == 32 || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'L' && p - stateStart == 34)) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- } else if ( (p - stateStart == 16 || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'L' && p - stateStart == 18))
- && is_upper_hex(inbuf + p - 16, 16)
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- } else if (p - stateStart == 24 && ( (inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'U')
- || (inbuf[stateStart] == 'L' && inbuf[stateStart + 1] == 'L')
- || (inbuf[stateStart] == 'F' && inbuf[stateStart + 1] == 'L')
- || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'U')
- )
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- printf("c %.*s\n", 22, inbuf + stateStart + 2);
- } else if (p - stateStart == 26 && memcmp(inbuf + stateStart, "UUSH", 4) == 0) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- printf("c %.*s\n", 22, inbuf + stateStart + 4);
- } else if (p - stateStart == 13 && ( (inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D')
- || (inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'L')
- || (inbuf[stateStart] == 'E' && inbuf[stateStart + 1] == 'L')
- || (inbuf[stateStart] == 'C' && inbuf[stateStart + 1] == 'L')
- || (inbuf[stateStart] == 'S' && inbuf[stateStart + 1] == 'L')
- || (inbuf[stateStart] == 'L' && inbuf[stateStart + 1] == 'P')
- )
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- printf("v %.*s\n", 11, inbuf + stateStart + 2);
- } else if ( p - stateStart == 15 && inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D'
- && ( (inbuf[stateStart + 2] == 'M' && inbuf[stateStart + 3] == 'M')
- || (inbuf[stateStart + 2] == 'Q' && inbuf[stateStart + 3] == 'M')
- || (inbuf[stateStart + 2] == 'E' && inbuf[stateStart + 3] == 'M')
- || (inbuf[stateStart + 2] == 'L' && inbuf[stateStart + 3] == 'V')
- || (inbuf[stateStart + 2] == 'H' && inbuf[stateStart + 3] == 'C')
- )
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- printf("v %.*s\n", 11, inbuf + stateStart + 4);
- } else if ( p - stateStart == 15 && inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D'
- && '0' <= inbuf[stateStart + 2] && inbuf[stateStart + 2] <= '4'
- && '0' <= inbuf[stateStart + 3] && inbuf[stateStart + 3] <= '9'
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- printf("v %.*s\n", 11, inbuf + stateStart + 4);
- } else if ( p - stateStart == 28 && ( memcmp(inbuf + stateStart, "RDAMVM", 6) == 0
- || memcmp(inbuf + stateStart, "RDGMEM", 6) == 0
- )
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- } else if (p - stateStart == 28 && memcmp(inbuf + stateStart, "RDCMUC", 6) == 0) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- printf("c %.*s\n", 24, inbuf + stateStart + 4);
- } else if ( p - stateStart == 41 && memcmp(inbuf + stateStart, "RDGMEM", 6) == 0
- && inbuf[stateStart + 28] == 'V' && inbuf[stateStart + 29] == 'M'
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- printf("v %.*s\n", 11, inbuf + stateStart + 30);
- } else if (p - stateStart == 26 && ( memcmp(inbuf + stateStart, "RDAO", 4) == 0
- || memcmp(inbuf + stateStart, "RDEM", 4) == 0
- || memcmp(inbuf + stateStart, "RDKM", 4) == 0
- )
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- } else if ( p - stateStart == 43 && 'k' <= inbuf[stateStart + 10] && inbuf[stateStart + 10] <= 'n'
- && ( memcmp(inbuf + stateStart, "RDCLAK5uy_", 10) == 0
- || memcmp(inbuf + stateStart, "RDTMAK5uy_", 10) == 0
- )
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- } else if ( p - stateStart == 41 && 'k' <= inbuf[stateStart + 8] && inbuf[stateStart + 8] <= 'n'
- && memcmp(inbuf + stateStart, "OLAK5uy_", 8) == 0
- ) {
- printf("p %.*s\n", p - stateStart, inbuf + stateStart);
- } else {
- printf("? %.*s\n", p - stateStart, inbuf + stateStart);
- }
- }
- }
- debug_print("%s\n", "Switching to STATE_NONE");
- state = STATE_NONE;
- stateStart = 0;
- }
- break;
-
- case STATE_SKIP_UNTIL_NONID:
- if (!IS_ID_CHAR(inbuf[p])) {
- debug_print("%s\n", "Switching to STATE_NONE");
- state = STATE_NONE;
- }
- break;
- }
- }
- if (state != STATE_NONE && state != STATE_SKIP_UNTIL_NONID) {
- // Need to keep the trailing part of the buffer for the next iteration.
- // Because stateStart gets reset to zero when it exceeds MAX_RESULT_SIZE, inbufEnd + readSize - stateStart is guaranteed to be smaller than MAX_RESULT_SIZE.
- // Because MAX_RESULT_SIZE < BUFFER_SIZE/2, we can simply copy the last bytes to the beginning of inbuf directly.
- debug_print("Copying %d bytes starting from %d to the beginning of the buffer: %.*s\n",
- inbufEnd + readSize - stateStart, stateStart, inbufEnd + readSize - stateStart, inbuf + stateStart);
- memcpy(inbuf, inbuf + stateStart, inbufEnd + readSize - stateStart);
- inbufEnd += readSize - stateStart;
- stateStart = 0;
- } else {
- debug_print("%s\n", "No buffer copying necessary");
- inbufEnd = 0;
- }
- }
- }
|