// stdin: YouTube URLs or data with little noise besides that
// stdout: lines for videos, channels, playlists, and unknown YouTube IDs found in the input, prefixed with v, c, p, and ?, respectively

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef DEBUG
#define DEBUG 0
#endif
// Prints a diagnostic to stderr when compiled with -DDEBUG=1. At least one argument after fmt is required.
#define debug_print(fmt, ...) do { if (DEBUG) fprintf(stderr, "%s:%d:%s(): " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); } while (0)

// Size of the input buffer in bytes.
#define BUFFER_SIZE (1024 * 1024)
// MAX_RESULT_SIZE is the maximum length of an individual match. This must be smaller than BUFFER_SIZE/2.
#define MAX_RESULT_SIZE 1024

// Scanner states.
#define STATE_NONE 0              // Not inside a run of ID characters.
#define STATE_ID 1                // Inside a run of ID characters that started at stateStart.
#define STATE_SKIP_UNTIL_NONID 2  // Inside a run that exceeded MAX_RESULT_SIZE; discard until it ends.

// True if c is one of the characters an ID may consist of: [0-9A-Za-z_-].
#define IS_ID_CHAR(c) (('0' <= (c) && (c) <= '9') || ('A' <= (c) && (c) <= 'Z') || ('a' <= (c) && (c) <= 'z') || (c) == '_' || (c) == '-')

// Returns true if the first len bytes at c are all uppercase hexadecimal digits [0-9A-F].
// An empty range (len == 0) yields true.
static inline bool is_upper_hex(const char *c, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		if (!(('0' <= c[i] && c[i] <= '9') || ('A' <= c[i] && c[i] <= 'F'))) {
			return false;
		}
	}
	return true;
}

// Returns true if the bytes at id begin with the NUL-terminated string prefix.
// The caller must guarantee that at least strlen(prefix) bytes are readable at id.
static bool has_prefix(const char *id, const char *prefix)
{
	return memcmp(id, prefix, strlen(prefix)) == 0;
}

// Writes one result line "<kind> <len bytes of s>\n" to stdout.
static void emit(char kind, const char *s, size_t len)
{
	// The '*' precision of printf takes an int; len never exceeds MAX_RESULT_SIZE here.
	printf("%c %.*s\n", kind, (int)len, s);
}

// Classifies a run of len ID characters starting at id and prints the matching result line(s).
// The caller only passes runs of at least 10 characters, so id[0] and id[1] are always readable;
// every deeper index or prefix comparison below is guarded by an exact length test.
// The order of the tests matters and mirrors the precedence of the different ID shapes.
static void classify_id(const char *id, size_t len)
{
	if (len == 11) {
		emit('v', id, len);
		return;
	}
	if (len == 22) {
		emit('c', id, len);
		return;
	}
	if (len == 24 && has_prefix(id, "UC")) {
		emit('c', id, len);
		return;
	}

	// Playlist candidates, some of which contain IDs for channels or videos
	if (len >= 19 && has_prefix(id, "RDAMPL")) {
		// Playlist ID starts with RDAMPL, which is followed by a normal playlist ID, so skip that.
		id += 6;
		len -= 6;
	}

	if (len == 32 || (len == 34 && has_prefix(id, "PL"))) {
		emit('p', id, len);
	} else if ((len == 16 || (len == 18 && has_prefix(id, "PL"))) && is_upper_hex(id + len - 16, 16)) {
		emit('p', id, len);
	} else if (len == 24 && (has_prefix(id, "UU") || has_prefix(id, "LL") || has_prefix(id, "FL") || has_prefix(id, "PU"))) {
		// Two-letter prefix followed by the 22-character channel ID.
		emit('p', id, len);
		emit('c', id + 2, 22);
	} else if (len == 26 && has_prefix(id, "UUSH")) {
		emit('p', id, len);
		emit('c', id + 4, 22);
	} else if (
		len == 13 && (
			has_prefix(id, "RD") || has_prefix(id, "UL") || has_prefix(id, "EL") ||
			has_prefix(id, "CL") || has_prefix(id, "SL") || has_prefix(id, "LP")
		)
	) {
		// Two-letter prefix followed by the 11-character video ID.
		emit('p', id, len);
		emit('v', id + 2, 11);
	} else if (
		len == 15 && has_prefix(id, "RD") && (
			has_prefix(id + 2, "MM") || has_prefix(id + 2, "QM") || has_prefix(id + 2, "EM") ||
			has_prefix(id + 2, "LV") || has_prefix(id + 2, "HC")
		)
	) {
		emit('p', id, len);
		emit('v', id + 4, 11);
	} else if (
		len == 15 && has_prefix(id, "RD") &&
		'0' <= id[2] && id[2] <= '4' &&
		'0' <= id[3] && id[3] <= '9'
	) {
		emit('p', id, len);
		emit('v', id + 4, 11);
	} else if (len == 28 && (has_prefix(id, "RDAMVM") || has_prefix(id, "RDGMEM"))) {
		emit('p', id, len);
	} else if (len == 28 && has_prefix(id, "RDCMUC")) {
		// The channel ID includes the "UC", hence offset 4 and length 24.
		emit('p', id, len);
		emit('c', id + 4, 24);
	} else if (len == 41 && has_prefix(id, "RDGMEM") && id[28] == 'V' && id[29] == 'M') {
		emit('p', id, len);
		emit('v', id + 30, 11);
	} else if (len == 26 && (has_prefix(id, "RDAO") || has_prefix(id, "RDEM") || has_prefix(id, "RDKM"))) {
		emit('p', id, len);
	} else if (
		len == 43 && 'k' <= id[10] && id[10] <= 'n' &&
		(has_prefix(id, "RDCLAK5uy_") || has_prefix(id, "RDTMAK5uy_"))
	) {
		emit('p', id, len);
	} else if (len == 41 && 'k' <= id[8] && id[8] <= 'n' && has_prefix(id, "OLAK5uy_")) {
		emit('p', id, len);
	} else {
		emit('?', id, len);
	}
}

// Reads stdin in chunks of up to BUFFER_SIZE bytes, finds maximal runs of ID characters,
// and prints a classification line for every run of 10 to MAX_RESULT_SIZE - 1 characters.
// A run that is cut off by the end of a chunk is moved to the start of the buffer and
// continued with the next chunk.
// Returns EXIT_SUCCESS, or EXIT_FAILURE if the buffer cannot be allocated or reading stdin fails.
int main(int argc, char **argv)
{
	(void)argc;
	(void)argv;

	char *inbuf = malloc(BUFFER_SIZE);
	if (inbuf == NULL) {
		perror("malloc");
		return EXIT_FAILURE;
	}

	int state = STATE_NONE;
	size_t stateStart = 0; // Buffer offset at which the current ID run began (meaningful in STATE_ID only).
	size_t inbufEnd = 0;   // Number of bytes at the start of inbuf carried over from the previous chunk.

	while (1) {
		debug_print("inbufEnd = %zu, state = %d, stateStart = %zu\n", inbufEnd, state, stateStart);
		debug_print("Reading %zu from stdin\n", (size_t)BUFFER_SIZE - inbufEnd);
		size_t readSize = fread(inbuf + inbufEnd, sizeof(char), BUFFER_SIZE - inbufEnd, stdin);
		debug_print("Got %zu bytes\n", readSize);
		if (readSize == 0) {
			if (inbufEnd == 0) {
				// Nothing read, nothing left from previous iteration. Bye.
				break;
			}
			// No more input data but still something left from the previous read.
			// Make sure that the next character cannot be considered valid in any state (NUL qualifies), then let the code below handle things.
			// inbufEnd is at most MAX_RESULT_SIZE, so this write stays inside the buffer.
			inbuf[inbufEnd] = '\0';
			readSize += 1;
		}

		size_t dataEnd = inbufEnd + readSize;
		for (size_t p = inbufEnd; p < dataEnd; p++) {
			debug_print("p = %zu, character = %c, state = %d, stateStart = %zu\n", p, inbuf[p], state, stateStart);
			if (state == STATE_ID && p - stateStart >= MAX_RESULT_SIZE) {
				debug_print("%s\n", "max result size exceeded, dropping result and switching to STATE_SKIP_UNTIL_NONID");
				state = STATE_SKIP_UNTIL_NONID;
				stateStart = 0;
			}

			switch (state) {
				case STATE_NONE:
					if (IS_ID_CHAR(inbuf[p])) {
						debug_print("%c is an ID char, switching to STATE_ID\n", inbuf[p]);
						state = STATE_ID;
						stateStart = p;
					}
					break;

				case STATE_ID:
					if (!IS_ID_CHAR(inbuf[p])) {
						debug_print("%c is not an ID char\n", inbuf[p]);
						size_t len = p - stateStart;
						if (len >= 10) {
							debug_print("p = %zu, stateStart = %zu, got %zu ID chars: %.*s\n", p, stateStart, len, (int)len, inbuf + stateStart);
							classify_id(inbuf + stateStart, len);
						}
						debug_print("%s\n", "Switching to STATE_NONE");
						state = STATE_NONE;
						stateStart = 0;
					}
					break;

				case STATE_SKIP_UNTIL_NONID:
					if (!IS_ID_CHAR(inbuf[p])) {
						debug_print("%s\n", "Switching to STATE_NONE");
						state = STATE_NONE;
					}
					break;
			}
		}

		if (state == STATE_ID) {
			// Need to keep the trailing part of the buffer for the next iteration.
			// Because the run is dropped once it reaches MAX_RESULT_SIZE, the tail is at most MAX_RESULT_SIZE bytes long.
			// After a short read the tail may overlap its destination at the start of the buffer, so memmove is required rather than memcpy.
			size_t tail = dataEnd - stateStart;
			debug_print("Copying %zu bytes starting from %zu to the beginning of the buffer: %.*s\n", tail, stateStart, (int)tail, inbuf + stateStart);
			memmove(inbuf, inbuf + stateStart, tail);
			inbufEnd = tail;
			stateStart = 0;
		} else {
			debug_print("%s\n", "No buffer copying necessary");
			inbufEnd = 0;
		}
	}

	// fread returns 0 on both end-of-file and error; tell them apart here.
	int status = EXIT_SUCCESS;
	if (ferror(stdin)) {
		perror("fread");
		status = EXIT_FAILURE;
	}
	free(inbuf);
	return status;
}