Browse Source

Fix whitespace handling

master
JustAnotherArchivist 2 years ago
parent
commit
479c268441
1 changed file with 15 additions and 13 deletions
  1. +15
    -13
      youtube-extract

+ 15
- 13
youtube-extract View File

@@ -56,7 +56,7 @@ channelPattern = '|'.join([
r'/www\.youtube\.com/c/[^/?&=.]+',
r'/www\.youtube\.com/user/[^/?&=.]+',
r'/www\.youtube\.com/channel/UC[0-9A-Za-z_-]{22}',
r'/www\.youtube\.com/[^/?&=.]+(?=/?$)',
r'/www\.youtube\.com/[^/?&=.\s]+(?=/?(\s|$))',
])

# Make sure that the last 11 chars of the match are always the video ID (because Python's re doesn't support \K).
@@ -114,19 +114,21 @@ for origLine in sys.stdin:
origLine = origLine.strip()
line = re.sub(r'^https?://', '//', origLine)
line = domainPattern.sub('/www.youtube.com/', line)
candidates = re.split(r'\s+', line)
hadMatches = False
for pattern, paramSearch, f in matchers:
results = set()
for m in itertools.chain((x for x in pattern.finditer(line)), (x for x in pattern.finditer(percentdecode(line))) if paramSearch else ()):
hadMatches = True
r = f(m)
if r in results:
continue
results.add(r)
if r is None:
for candidate in candidates:
for pattern, paramSearch, f in matchers:
results = set()
for m in itertools.chain((x for x in pattern.finditer(candidate)), (x for x in pattern.finditer(percentdecode(candidate))) if paramSearch else ()):
hadMatches = True
r = f(m)
if r in results:
continue
results.add(r)
if r is None:
break
print(r)
if None in results:
break
print(r)
if None in results:
break
if not hadMatches:
print(origLine, file = sys.stderr)

Loading…
Cancel
Save