From 360c4d93710eadbd03792100bb3c7c85e79748bd Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Fri, 26 Nov 2021 22:11:41 +0000
Subject: [PATCH] Add youtube-extract-rapid

---
 .youtube-extract-rapid-test |  97 ++++++++++++++++++
 youtube-extract-rapid       |   1 +
 youtube-extract-rapid.c     | 204 ++++++++++++++++++++++++++++++++++++
 3 files changed, 302 insertions(+)
 create mode 100755 .youtube-extract-rapid-test
 create mode 120000 youtube-extract-rapid
 create mode 100644 youtube-extract-rapid.c

diff --git a/.youtube-extract-rapid-test b/.youtube-extract-rapid-test
new file mode 100755
index 0000000..f494fcc
--- /dev/null
+++ b/.youtube-extract-rapid-test
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+import itertools
+import subprocess
+
+
+def test(input, lines):
+    """Feed input to the binary on stdin and assert that it prints exactly lines on stdout and nothing on stderr."""
+    p = subprocess.Popen(['./.make-and-exec-binaries/youtube-extract-rapid'], text = False, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
+    stdout, stderr = p.communicate(input)
+    assert not stderr
+    stdout = stdout.split(b'\n')
+    assert stdout[-1] == b'' and stdout[:-1] == lines, f'Got {stdout!r} instead of {lines!r} from {input!r}'
+
+
+def is_id_char(c):
+    # c is a single-byte bytes object, so these lexicographic comparisons are plain byte-range checks
+    return b'0' <= c <= b'9' or b'a' <= c <= b'z' or b'A' <= c <= b'Z' or c == b'_' or c == b'-'
+
+
+def bytes_range(a, b):
+    # Yields every char between a and b (inclusive) as a bytes object
+    return map(lambda x: bytes([x]), range(ord(a), ord(b) + 1))
+
+
+test(b'', [])
+test(b'short\n', [])
+test(b'01234567890', [b'v 01234567890'])
+test(b'01234567890\n', [b'v 01234567890'])
+
+# Videos
+input = []
+for a in map(lambda x: bytes.fromhex(f'{x:02x}'), range(256)):
+    if is_id_char(a):
+        continue
+    for b in map(lambda x: bytes.fromhex(f'{x:02x}'), range(256)):
+        if is_id_char(b):
+            continue
+        input.append(a + b'0aA_-1bB_-2' + b)
+test(b''.join(input), [b'v 0aA_-1bB_-2'] * len(input))
+
+# Channels
+test(b'0123456789abcdeFGHIJ_-', [b'c 0123456789abcdeFGHIJ_-'])
+test(b'UC0123456789abcdeFGHIJ_-', [b'c UC0123456789abcdeFGHIJ_-'])
+
+# Pure playlists
+playlists = [
+    b'0123456789ABCDEF',
+    b'PL0123456789ABCDEF',
+    b'0123456789abcdefghijABCDEFGHIJ_-',
+    b'PL0123456789abcdefghijABCDEFGHIJ_-',
+    b'RDAMVM0123456789abcdeFGHIJ_-',
+    b'RDGMEM0123456789abcdeFGHIJ_-',
+    b'RDAO0123456789abcdeFGHIJ_-',
+    b'RDEM0123456789abcdeFGHIJ_-',
+    b'RDKM0123456789abcdeFGHIJ_-',
+]
+for playlist in playlists:
+    test(playlist, [b'p ' + playlist])
+
+# Music playlist madness
+for prefix in (b'RDCLAK5uy_', b'RDTMAK5uy_', b'OLAK5uy_'):
+    for c in bytes_range(b'k', b'n'):
+        test(prefix + c + b'0123456789abcdefghijABCDEFGHIJ_-', [b'p ' + prefix + c + b'0123456789abcdefghijABCDEFGHIJ_-'])
+
+# Playlists with video IDs
+for prefix in (b'RD', b'UL', b'EL', b'CL', b'SL', b'LP', b'RDMM', b'RDQM', b'RDEM', b'RDLV', b'RDHC'):
+    test(prefix + b'0aA_-1bB_-2', [b'p ' + prefix + b'0aA_-1bB_-2', b'v 0aA_-1bB_-2'])
+for a, b in itertools.product(bytes_range(b'0', b'4'), bytes_range(b'0', b'9')):
+    playlist = b'RD' + a + b + b'0aA_-1bB_-2'
+    test(playlist, [b'p ' + playlist, b'v 0aA_-1bB_-2'])
+playlist = b'RDGMEM' + b'0123456789abcdeFGHIJ_-' + b'VM0aA_-1bB_-2'
+test(playlist, [b'p ' + playlist, b'v 0aA_-1bB_-2'])
+
+# Playlists with channel IDs
+for prefix in (b'UU', b'LL', b'FL', b'PU', b'UUSH'):
+    test(prefix + b'0123456789abcdeFGHIJ_-', [b'p ' + prefix + b'0123456789abcdeFGHIJ_-', b'c 0123456789abcdeFGHIJ_-'])
+test(b'RDCMUC0123456789abcdeFGHIJ_-', [b'p RDCMUC0123456789abcdeFGHIJ_-', b'c UC0123456789abcdeFGHIJ_-'])
+
+# Some particular unrecognised IDs
+ids = [
+    b'0123456789ABCDEG',
+    b'PL0123456789ABCDEG',
+    b'RDCLAK5uy_j0123456789abcdefghijABCDEFGHIJ_-',
+    b'RDCLAK5uy_o0123456789abcdefghijABCDEFGHIJ_-',
+]
+for id_ in ids:
+    test(id_, [b'? ' + id_])
+
+# Buffer rollover
+BUFFER_SIZE = 1024 * 1024
+for offset in range(-11, 1):
+    test(b'?' * (BUFFER_SIZE + offset) + b'0aA_-1bB_-2', [b'v 0aA_-1bB_-2'])
+
+# Max length exceedance
+MAX_RESULT_SIZE = 1024
+for length in range(MAX_RESULT_SIZE + 1, MAX_RESULT_SIZE + 15):
+    test(b'0' * length, [])
diff --git a/youtube-extract-rapid b/youtube-extract-rapid
new file mode 120000
index 0000000..4c68fa7
--- /dev/null
+++ b/youtube-extract-rapid
@@ -0,0 +1 @@
+.make-and-exec
\ No newline at end of file
diff --git a/youtube-extract-rapid.c b/youtube-extract-rapid.c
new file mode 100644
index 0000000..f62a2ef
--- /dev/null
+++ b/youtube-extract-rapid.c
@@ -0,0 +1,204 @@
+// stdin: YouTube URLs or data with little noise besides that
+// stdout: lines for videos, channels, playlists, and unknown YouTube IDs found in the input, prefixed with v, c, p, and ?, respectively
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+#define debug_print(fmt, ...) do { if (DEBUG) fprintf(stderr, "%s:%d:%s(): " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); } while (0)
+
+// Parenthesised so that the macro expands safely inside any expression.
+#define BUFFER_SIZE (1024 * 1024)
+#define MAX_RESULT_SIZE 1024
+// MAX_RESULT_SIZE is the maximum length of an individual match. This must be smaller than BUFFER_SIZE/2.
+
+#define STATE_NONE 0
+#define STATE_ID 1
+#define STATE_SKIP_UNTIL_NONID 2
+
+#define IS_ID_CHAR(c) (('0' <= (c) && (c) <= '9') || ('A' <= (c) && (c) <= 'Z') || ('a' <= (c) && (c) <= 'z') || (c) == '_' || (c) == '-')
+
+// Returns true iff the first len bytes at c are all upper-case hexadecimal digits (0-9, A-F).
+// static: a plain C99 `inline` definition emits no external symbol, so a call that is not inlined (e.g. at -O0) would fail to link.
+static inline bool is_upper_hex(const char* c, size_t len) {
+    for (size_t i = 0; i < len; i++) {
+        if (!(('0' <= c[i] && c[i] <= '9') || ('A' <= c[i] && c[i] <= 'F')))
+            return false;
+    }
+    return true;
+}
+
+// Reads stdin in BUFFER_SIZE chunks, finds runs of ID characters, and prints one classified line for each run of at least 10 characters.
+int main(int argc, char** argv) {
+    char* inbuf = malloc(sizeof(char) * BUFFER_SIZE);
+    if (inbuf == NULL) {
+        perror("malloc");
+        return 1;
+    }
+    int state = STATE_NONE;
+    size_t stateStart = 0;
+    size_t inbufEnd = 0;
+
+    while (1) {
+        debug_print("inbufEnd = %zu, state = %d, stateStart = %zu\n", inbufEnd, state, stateStart);
+        debug_print("Reading %zu from stdin\n", BUFFER_SIZE - inbufEnd);
+        size_t readSize = fread(inbuf + inbufEnd, sizeof(char), BUFFER_SIZE - inbufEnd, stdin);
+        debug_print("Got %zu bytes\n", readSize);
+        if (readSize == 0) {
+            if (inbufEnd == 0) {
+                // Nothing read, nothing left from previous iteration. Bye.
+                break;
+            } else {
+                // No more input data but still something left from the previous read.
+                // Make sure that the next character cannot be considered valid in any state (NUL qualifies), then let the code below handle things.
+                inbuf[inbufEnd] = '\0';
+                readSize += 1;
+            }
+        }
+        for (size_t p = inbufEnd; p < inbufEnd + readSize; p++) {
+            debug_print("p = %zu, character = %c, state = %d, stateStart = %zu\n", p, inbuf[p], state, stateStart);
+            if (state != STATE_NONE && state != STATE_SKIP_UNTIL_NONID && p - stateStart >= MAX_RESULT_SIZE) {
+                debug_print("%s\n", "max result size exceeded, dropping result and switching to STATE_SKIP_UNTIL_NONID");
+                state = STATE_SKIP_UNTIL_NONID;
+                stateStart = 0;
+            }
+            switch (state) {
+                case STATE_NONE:
+                    if (IS_ID_CHAR(inbuf[p])) {
+                        debug_print("%c is an ID char, switching to STATE_ID\n", inbuf[p]);
+                        state = STATE_ID;
+                        stateStart = p;
+                    }
+                    break;
+
+                case STATE_ID:
+                    if (!IS_ID_CHAR(inbuf[p])) {
+                        debug_print("%c is not an ID char\n", inbuf[p]);
+                        if (p - stateStart >= 10) {
+                            // The `*` precision of %.*s takes an int. p - stateStart is below MAX_RESULT_SIZE here, so the casts below cannot overflow.
+                            debug_print("p = %zu, stateStart = %zu, got %zu ID chars: %.*s\n", p, stateStart, p - stateStart, (int)(p - stateStart), inbuf + stateStart);
+                            if (p - stateStart == 11) {
+                                printf("v %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                            } else if (p - stateStart == 22) {
+                                printf("c %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                            } else if (p - stateStart == 24 && inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'C') {
+                                printf("c %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                            } else {
+                                // Playlist candidates, some of which contain IDs for channels or videos
+                                if (p - stateStart >= 19 && memcmp(inbuf + stateStart, "RDAMPL", 6) == 0) {
+                                    // Playlist ID starts with RDAMPL, which is followed by a normal playlist ID, so skip that.
+                                    stateStart += 6;
+                                }
+                                if (p - stateStart == 32 || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'L' && p - stateStart == 34)) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                } else if (   (p - stateStart == 16 || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'L' && p - stateStart == 18))
+                                           && is_upper_hex(inbuf + p - 16, 16)
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                } else if (p - stateStart == 24 && (   (inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'U')
+                                                                    || (inbuf[stateStart] == 'L' && inbuf[stateStart + 1] == 'L')
+                                                                    || (inbuf[stateStart] == 'F' && inbuf[stateStart + 1] == 'L')
+                                                                    || (inbuf[stateStart] == 'P' && inbuf[stateStart + 1] == 'U')
+                                                                   )
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                    printf("c %.*s\n", 22, inbuf + stateStart + 2);
+                                } else if (p - stateStart == 26 && memcmp(inbuf + stateStart, "UUSH", 4) == 0) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                    printf("c %.*s\n", 22, inbuf + stateStart + 4);
+                                } else if (p - stateStart == 13 && (   (inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D')
+                                                                    || (inbuf[stateStart] == 'U' && inbuf[stateStart + 1] == 'L')
+                                                                    || (inbuf[stateStart] == 'E' && inbuf[stateStart + 1] == 'L')
+                                                                    || (inbuf[stateStart] == 'C' && inbuf[stateStart + 1] == 'L')
+                                                                    || (inbuf[stateStart] == 'S' && inbuf[stateStart + 1] == 'L')
+                                                                    || (inbuf[stateStart] == 'L' && inbuf[stateStart + 1] == 'P')
+                                                                   )
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                    printf("v %.*s\n", 11, inbuf + stateStart + 2);
+                                } else if (   p - stateStart == 15 && inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D'
+                                           && (   (inbuf[stateStart + 2] == 'M' && inbuf[stateStart + 3] == 'M')
+                                               || (inbuf[stateStart + 2] == 'Q' && inbuf[stateStart + 3] == 'M')
+                                               || (inbuf[stateStart + 2] == 'E' && inbuf[stateStart + 3] == 'M')
+                                               || (inbuf[stateStart + 2] == 'L' && inbuf[stateStart + 3] == 'V')
+                                               || (inbuf[stateStart + 2] == 'H' && inbuf[stateStart + 3] == 'C')
+                                              )
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                    printf("v %.*s\n", 11, inbuf + stateStart + 4);
+                                } else if (   p - stateStart == 15 && inbuf[stateStart] == 'R' && inbuf[stateStart + 1] == 'D'
+                                           && '0' <= inbuf[stateStart + 2] && inbuf[stateStart + 2] <= '4'
+                                           && '0' <= inbuf[stateStart + 3] && inbuf[stateStart + 3] <= '9'
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                    printf("v %.*s\n", 11, inbuf + stateStart + 4);
+                                } else if (p - stateStart == 28 && (   memcmp(inbuf + stateStart, "RDAMVM", 6) == 0
+                                                                    || memcmp(inbuf + stateStart, "RDGMEM", 6) == 0
+                                                                   )
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                } else if (p - stateStart == 28 && memcmp(inbuf + stateStart, "RDCMUC", 6) == 0) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                    printf("c %.*s\n", 24, inbuf + stateStart + 4);
+                                } else if (   p - stateStart == 41 && memcmp(inbuf + stateStart, "RDGMEM", 6) == 0
+                                           && inbuf[stateStart + 28] == 'V' && inbuf[stateStart + 29] == 'M'
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                    printf("v %.*s\n", 11, inbuf + stateStart + 30);
+                                } else if (p - stateStart == 26 && (   memcmp(inbuf + stateStart, "RDAO", 4) == 0
+                                                                    || memcmp(inbuf + stateStart, "RDEM", 4) == 0
+                                                                    || memcmp(inbuf + stateStart, "RDKM", 4) == 0
+                                                                   )
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                } else if (   p - stateStart == 43 && 'k' <= inbuf[stateStart + 10] && inbuf[stateStart + 10] <= 'n'
+                                           && (   memcmp(inbuf + stateStart, "RDCLAK5uy_", 10) == 0
+                                               || memcmp(inbuf + stateStart, "RDTMAK5uy_", 10) == 0
+                                              )
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                } else if (   p - stateStart == 41 && 'k' <= inbuf[stateStart + 8] && inbuf[stateStart + 8] <= 'n'
+                                           && memcmp(inbuf + stateStart, "OLAK5uy_", 8) == 0
+                                          ) {
+                                    printf("p %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                } else {
+                                    printf("? %.*s\n", (int)(p - stateStart), inbuf + stateStart);
+                                }
+                            }
+                        }
+                        debug_print("%s\n", "Switching to STATE_NONE");
+                        state = STATE_NONE;
+                        stateStart = 0;
+                    }
+                    break;
+
+                case STATE_SKIP_UNTIL_NONID:
+                    if (!IS_ID_CHAR(inbuf[p])) {
+                        debug_print("%s\n", "Switching to STATE_NONE");
+                        state = STATE_NONE;
+                    }
+                    break;
+            }
+        }
+        if (state != STATE_NONE && state != STATE_SKIP_UNTIL_NONID) {
+            // Need to keep the trailing part of the buffer for the next iteration.
+            // Because the state switches to STATE_SKIP_UNTIL_NONID once a run reaches MAX_RESULT_SIZE, inbufEnd + readSize - stateStart cannot exceed MAX_RESULT_SIZE.
+            // Source and destination overlap (or coincide) when the last read was short, e.g. input smaller than the buffer, so this must be memmove rather than memcpy.
+            debug_print("Copying %zu bytes starting from %zu to the beginning of the buffer: %.*s\n",
+                        inbufEnd + readSize - stateStart, stateStart, (int)(inbufEnd + readSize - stateStart), inbuf + stateStart);
+            memmove(inbuf, inbuf + stateStart, inbufEnd + readSize - stateStart);
+            inbufEnd += readSize - stateStart;
+            stateStart = 0;
+        } else {
+            debug_print("%s\n", "No buffer copying necessary");
+            inbufEnd = 0;
+        }
+    }
+
+    free(inbuf);
+    return 0;
+}