|
|
@@ -0,0 +1,192 @@ |
|
|
|
// stdin: YouTube URLs, or other data that contains little noise besides YouTube IDs
|
|
|
// stdout: one line per video, channel, playlist, or unknown YouTube ID found in the input, prefixed with v, c, p, or ?, respectively
|
|
|
|
|
|
|
#include <stdbool.h> |
|
|
|
#include <stdio.h> |
|
|
|
#include <stdlib.h> |
|
|
|
#include <string.h> |
|
|
|
|
|
|
|
// DEBUG may be predefined on the compiler command line (e.g. -DDEBUG=1); it defaults to 0 (tracing disabled).
#ifndef DEBUG
#define DEBUG 0
#endif

// Writes a trace message to stderr, prefixed with "file:line:function(): ", when DEBUG is non-zero.
// fmt must be a string literal (it is concatenated with the prefix) and at least one variadic argument is required.
// The fprintf call is always compiled, so format strings are checked even when DEBUG is 0; with DEBUG == 0 the
// arguments are never evaluated at run time.
// The do { ... } while (0) wrapper makes the macro behave as a single statement, e.g. in an unbraced if/else.
#define debug_print(fmt, ...) \
    do { \
        if (DEBUG) { \
            fprintf(stderr, "%s:%d:%s(): " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); \
        } \
    } while (0)
|
|
|
|
|
|
|
// Size in bytes of the input buffer. Parenthesized so that the macro expands correctly inside larger
// expressions such as x / BUFFER_SIZE.
#define BUFFER_SIZE (1024 * 1024)

// MAX_RESULT_SIZE is the maximum length of an individual match. This must be smaller than BUFFER_SIZE/2.
#define MAX_RESULT_SIZE 1024

#if MAX_RESULT_SIZE >= BUFFER_SIZE / 2
#error "MAX_RESULT_SIZE must be smaller than BUFFER_SIZE/2"
#endif
|
|
|
|
|
|
|
// Scanner states used while walking the input one byte at a time.
enum {
    STATE_NONE = 0,             // Not inside a run of ID characters.
    STATE_ID = 1,               // Inside a run of ID characters that is still a candidate match.
    STATE_SKIP_UNTIL_NONID = 2  // Inside a run that exceeded MAX_RESULT_SIZE; ignored until a non-ID character.
};
|
|
|
|
|
|
|
// Non-zero iff c is one of the characters that can occur in an ID: 0-9, A-Z, a-z, '_' or '-'.
// The argument is parenthesized so that arbitrary expressions can be passed, but it is evaluated several
// times: do not pass an expression with side effects.
#define IS_ID_CHAR(c) \
    (('0' <= (c) && (c) <= '9') || ('A' <= (c) && (c) <= 'Z') || ('a' <= (c) && (c) <= 'z') || (c) == '_' || (c) == '-')
|
|
|
|
|
|
|
// Returns true iff each of the first len bytes at c is an uppercase hexadecimal digit (0-9 or A-F).
// Lowercase a-f are rejected; an empty range (len == 0) yields true.
// The function is static because a plain `inline` definition does not provide an external definition in
// C99/C11, so any call the compiler decides not to inline would otherwise fail to link.
static inline bool is_upper_hex(const char* c, size_t len) {
    for (size_t i = 0; i < len; i++) {
        bool isDigit = '0' <= c[i] && c[i] <= '9';
        bool isUpperAF = 'A' <= c[i] && c[i] <= 'F';
        if (!isDigit && !isUpperAF) {
            return false;
        }
    }
    return true;
}
|
|
|
|
|
|
|
int main(int argc, char** argv) { |
|
|
|
char* inbuf = malloc(sizeof(char) * BUFFER_SIZE); |
|
|
|
int state = 0; |
|
|
|
size_t stateStart = 0; |
|
|
|
size_t inbufEnd = 0; |
|
|
|
|
|
|
|
while (1) { |
|
|
|
debug_print("inbufEnd = %d, state = %d, stateStart = %d\n", inbufEnd, state, stateStart); |
|
|
|
debug_print("Reading %d from stdin\n", BUFFER_SIZE - inbufEnd); |
|
|
|
size_t readSize = fread(inbuf + inbufEnd, sizeof(char), BUFFER_SIZE - inbufEnd, stdin); |
|
|
|
debug_print("Got %d bytes\n", readSize); |
|
|
|
if (readSize == 0) { |
|
|
|
if (inbufEnd == 0) { |
|
|
|
// Nothing read, nothing left from previous iteration. Bye. |
|
|
|
break; |
|
|
|
} else { |
|
|
|
// No more input data but still something left from the previous read. |
|
|
|
// Make sure that the next character cannot be considered valid in any state (NUL qualifies), then let the code below handle things. |
|
|
|
inbuf[inbufEnd] = '\0'; |
|
|
|
readSize += 1; |
|
|
|
} |
|
|
|
} |
|
|
|
for (size_t p = inbufEnd; p < inbufEnd + readSize; p++) { |
|
|
|
debug_print("p = %d, character = %c, state = %d, stateStart = %d\n", p, inbuf[p], state, stateStart); |
|
|
|
if (state != STATE_NONE && state != STATE_SKIP_UNTIL_NONID && p - stateStart >= MAX_RESULT_SIZE) { |
|
|
|
debug_print("%s\n", "max result size exceeded, dropping result and switching to STATE_SKIP_UNTIL_NONID"); |
|
|
|
state = STATE_SKIP_UNTIL_NONID; |
|
|
|
stateStart = 0; |
|
|
|
} |
|
|
|
switch (state) { |
|
|
|
case STATE_NONE: |
|
|
|
if (IS_ID_CHAR(inbuf[p])) { |
|
|
|
debug_print("%c is an ID char, switching to STATE_ID\n", inbuf[p]); |
|
|
|
state = STATE_ID; |
|
|
|
stateStart = p; |
|
|
|
} |
|
|
|
break; |
|
|
|
|
|
|
|
case STATE_ID: |
|
|
|
if (!IS_ID_CHAR(inbuf[p])) { |
|
|
|
debug_print("%c is not an ID char\n", inbuf[p]); |
|
|
|
if (p - stateStart >= 10) { |
|
|
|
debug_print("p = %d, stateStart = %d, got %d ID chars: %.*s\n", p, stateStart, p - stateStart, p - stateStart, inbuf + stateStart); |
|
|
|
if (p - stateStart == 11) { |
|
|
|
printf("v %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} else if (p - stateStart == 22) { |
|
|
|
printf("c %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} else if (p - stateStart == 24 && inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'C') { |
|
|
|
printf("c %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} else { |
|
|
|
// Playlist candidates, some of which contain IDs for channels or videos |
|
|
|
if (p - stateStart >= 19 && memcmp(inbuf + stateStart, "RDAMPL", 6) == 0) { |
|
|
|
// Playlist ID starts with RDAMPL, which is followed by a normal playlist ID, so skip that. |
|
|
|
stateStart += 6; |
|
|
|
} |
|
|
|
if (p - stateStart == 32 || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'L' && p - stateStart == 34)) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} else if ( (p - stateStart == 16 || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'L' && p - stateStart == 18)) |
|
|
|
&& is_upper_hex(inbuf + p - 16, 16) |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} else if (p - stateStart == 24 && ( (inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'U') |
|
|
|
|| (inbuf[stateStart] == 'L' && inbuf[stateStart + 1] == 'L') |
|
|
|
|| (inbuf[stateStart] == 'F' && inbuf[stateStart + 1] == 'L') |
|
|
|
|| (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'U') |
|
|
|
) |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
printf("c %.*s\n", 22, inbuf + stateStart + 2); |
|
|
|
} else if (p - stateStart == 26 && memcmp(inbuf + stateStart, "UUSH", 4) == 0) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
printf("c %.*s\n", 22, inbuf + stateStart + 4); |
|
|
|
} else if (p - stateStart == 13 && ( (inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D') |
|
|
|
|| (inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'L') |
|
|
|
|| (inbuf[stateStart] == 'E' && inbuf[stateStart + 1] == 'L') |
|
|
|
|| (inbuf[stateStart] == 'C' && inbuf[stateStart + 1] == 'L') |
|
|
|
|| (inbuf[stateStart] == 'S' && inbuf[stateStart + 1] == 'L') |
|
|
|
|| (inbuf[stateStart] == 'L' && inbuf[stateStart + 1] == 'P') |
|
|
|
) |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
printf("v %.*s\n", 11, inbuf + stateStart + 2); |
|
|
|
} else if ( p - stateStart == 15 && inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D' |
|
|
|
&& ( (inbuf[stateStart + 2] == 'M' && inbuf[stateStart + 3] == 'M') |
|
|
|
|| (inbuf[stateStart + 2] == 'Q' && inbuf[stateStart + 3] == 'M') |
|
|
|
|| (inbuf[stateStart + 2] == 'E' && inbuf[stateStart + 3] == 'M') |
|
|
|
|| (inbuf[stateStart + 2] == 'L' && inbuf[stateStart + 3] == 'V') |
|
|
|
|| (inbuf[stateStart + 2] == 'H' && inbuf[stateStart + 3] == 'C') |
|
|
|
) |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
printf("v %.*s\n", 11, inbuf + stateStart + 4); |
|
|
|
} else if ( p - stateStart == 15 && inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D' |
|
|
|
&& '0' <= inbuf[stateStart + 2] && inbuf[stateStart + 2] <= '4' |
|
|
|
&& '0' <= inbuf[stateStart + 3] && inbuf[stateStart + 3] <= '9' |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
printf("v %.*s\n", 11, inbuf + stateStart + 4); |
|
|
|
} else if ( p - stateStart == 28 && ( memcmp(inbuf + stateStart, "RDAMVM", 6) == 0 |
|
|
|
|| memcmp(inbuf + stateStart, "RDGMEM", 6) == 0 |
|
|
|
) |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} else if (p - stateStart == 28 && memcmp(inbuf + stateStart, "RDCMUC", 6) == 0) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
printf("c %.*s\n", 24, inbuf + stateStart + 4); |
|
|
|
} else if ( p - stateStart == 41 && memcmp(inbuf + stateStart, "RDGMEM", 6) == 0 |
|
|
|
&& inbuf[stateStart + 28] == 'V' && inbuf[stateStart + 29] == 'M' |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
printf("v %.*s\n", 11, inbuf + stateStart + 30); |
|
|
|
} else if (p - stateStart == 26 && ( memcmp(inbuf + stateStart, "RDAO", 4) == 0 |
|
|
|
|| memcmp(inbuf + stateStart, "RDEM", 4) == 0 |
|
|
|
|| memcmp(inbuf + stateStart, "RDKM", 4) == 0 |
|
|
|
) |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} else if ( p - stateStart == 43 && 'k' <= inbuf[stateStart + 10] && inbuf[stateStart + 10] <= 'n' |
|
|
|
&& ( memcmp(inbuf + stateStart, "RDCLAK5uy_", 10) == 0 |
|
|
|
|| memcmp(inbuf + stateStart, "RDTMAK5uy_", 10) == 0 |
|
|
|
) |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} else if ( p - stateStart == 41 && 'k' <= inbuf[stateStart + 8] && inbuf[stateStart + 8] <= 'n' |
|
|
|
&& memcmp(inbuf + stateStart, "OLAK5uy_", 8) == 0 |
|
|
|
) { |
|
|
|
printf("p %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} else { |
|
|
|
printf("? %.*s\n", p - stateStart, inbuf + stateStart); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
debug_print("%s\n", "Switching to STATE_NONE"); |
|
|
|
state = STATE_NONE; |
|
|
|
stateStart = 0; |
|
|
|
} |
|
|
|
break; |
|
|
|
|
|
|
|
case STATE_SKIP_UNTIL_NONID: |
|
|
|
if (!IS_ID_CHAR(inbuf[p])) { |
|
|
|
debug_print("%s\n", "Switching to STATE_NONE"); |
|
|
|
state = STATE_NONE; |
|
|
|
} |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
if (state != STATE_NONE && state != STATE_SKIP_UNTIL_NONID) { |
|
|
|
// Need to keep the trailing part of the buffer for the next iteration. |
|
|
|
// Because stateStart gets reset to zero when it exceeds MAX_RESULT_SIZE, inbufEnd + readSize - stateStart is guaranteed to be smaller than MAX_RESULT_SIZE. |
|
|
|
// Because MAX_RESULT_SIZE < BUFFER_SIZE/2, we can simply copy the last bytes to the beginning of inbuf directly. |
|
|
|
debug_print("Copying %d bytes starting from %d to the beginning of the buffer: %.*s\n", |
|
|
|
inbufEnd + readSize - stateStart, stateStart, inbufEnd + readSize - stateStart, inbuf + stateStart); |
|
|
|
memcpy(inbuf, inbuf + stateStart, inbufEnd + readSize - stateStart); |
|
|
|
inbufEnd += readSize - stateStart; |
|
|
|
stateStart = 0; |
|
|
|
} else { |
|
|
|
debug_print("%s\n", "No buffer copying necessary"); |
|
|
|
inbufEnd = 0; |
|
|
|
} |
|
|
|
} |
|
|
|
} |