From 479c26844111e8b7ed93385faedf8ebb3c0a354c Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 25 Jul 2021 19:23:41 +0000 Subject: [PATCH] Fix whitespace handling --- youtube-extract | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/youtube-extract b/youtube-extract index 2582c8b..fb4e49e 100755 --- a/youtube-extract +++ b/youtube-extract @@ -56,7 +56,7 @@ channelPattern = '|'.join([ r'/www\.youtube\.com/c/[^/?&=.]+', r'/www\.youtube\.com/user/[^/?&=.]+', r'/www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}', - r'/www\.youtube\.com/[^/?&=.]+(?=/?$)', + r'/www\.youtube\.com/[^/?&=.\s]+(?=/?(\s|$))', ]) # Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K). @@ -114,19 +114,21 @@ for origLine in sys.stdin: origLine = origLine.strip() line = re.sub(r'^https?://', '//', origLine) line = domainPattern.sub('/www.youtube.com/', line) + candidates = re.split(r'\s+', line) hadMatches = False - for pattern, paramSearch, f in matchers: - results = set() - for m in itertools.chain((x for x in pattern.finditer(line)), (x for x in pattern.finditer(percentdecode(line))) if paramSearch else ()): - hadMatches = True - r = f(m) - if r in results: - continue - results.add(r) - if r is None: + for candidate in candidates: + for pattern, paramSearch, f in matchers: + results = set() + for m in itertools.chain((x for x in pattern.finditer(candidate)), (x for x in pattern.finditer(percentdecode(candidate))) if paramSearch else ()): + hadMatches = True + r = f(m) + if r in results: + continue + results.add(r) + if r is None: + break + print(r) + if None in results: break - print(r) - if None in results: - break if not hadMatches: print(origLine, file = sys.stderr)