Browse Source

Fix str_get_all_between yielding half-overlapping matches

master
JustAnotherArchivist 3 years ago
parent
commit
a4cf1a4225
1 changed files with 5 additions and 4 deletions
  1. +5
    -4
      qwarc/utils.py

+ 5
- 4
qwarc/utils.py View File

@@ -91,16 +91,17 @@ def maybe_str_get_between(x, a, b):


def str_get_all_between(aStr, a, b):
    '''Generator yielding every string between an occurrence of a in aStr and the following occurrence of b.

    Matches do not half-overlap: an occurrence of a that starts before the end
    of the previously yielded match (including its closing b) is skipped. For
    example, str_get_all_between('aabc', 'a', 'c') yields only 'ab', not 'ab'
    and 'b'.

    NOTE(review): relies on find_all, defined elsewhere in this module; it is
    assumed to yield the start offsets of a in aStr in increasing order.
    '''

    # End position (exclusive) of the last yielded match including b; -1 means
    # nothing has been yielded yet, so no offset is ever rejected at first.
    prevEnd = -1
    for aOffset in find_all(aStr, a):
        # This occurrence of a lies inside the previous match; discard it so
        # that results never share a closing b.
        if aOffset < prevEnd:
            continue
        offset = aOffset + len(a)
        bPos = aStr.find(b, offset)
        if bPos != -1:
            yield aStr[offset:bPos]
            prevEnd = bPos + len(b)


def maybe_str_get_all_between(x, a, b):


Loading…
Cancel
Save