Browse Source

"Freeze" log file object before writing to WARC to ensure that further log messages aren't picked up

This is a workaround for https://github.com/webrecorder/warcio/issues/90
master
JustAnotherArchivist 3 years ago
parent
commit
dbe1ed71ab
2 changed files with 45 additions and 1 deletions
  1. +42
    -0
      qwarc/utils.py
  2. +3
    -1
      qwarc/warc.py

+ 42
- 0
qwarc/utils.py View File

@@ -279,6 +279,48 @@ class ReadonlyFileView:
return getattr(self._fp, key)


class FrozenFileView:
    '''
    A minimal frozen view of a file object.

    The view fixes the bounds of the file to [begin, end): if something is appended to the underlying file object
    after the view was created, it does not become visible through the view. Only seek, tell, and read are implemented.

    All offsets accepted and returned by this class are relative to `begin`.

    Note that seeks and reads affect the position of the underlying file object. The actual data is not really frozen
    either: any change to the underlying file within the bounds is visible through the view as well.
    '''

    def __init__(self, fp, begin, end):
        '''
        fp: file-like object supporting seek, tell, and read
        begin: int, offset from beginning of the file where the view starts
        end: int, offset from beginning of the file where the view ends (exclusive)
        '''

        self._fp = fp
        self._begin = begin
        self._end = end

    def seek(self, offset, whence = os.SEEK_SET):
        '''
        Move the position of the underlying file object and return the new position relative to the view's beginning.

        SEEK_SET is relative to `begin`, SEEK_CUR to the current position, and SEEK_END to the frozen `end`
        (not to the current end of the underlying file). Any other whence raises NotImplementedError.
        '''

        if whence == os.SEEK_SET:
            newPos = self._fp.seek(self._begin + offset, os.SEEK_SET)
        elif whence == os.SEEK_CUR:
            newPos = self._fp.seek(offset, os.SEEK_CUR)
        elif whence == os.SEEK_END:
            # The frozen end is an absolute offset, so the target has to be an absolute seek;
            # seeking relative to the underlying file's end would include data appended later.
            newPos = self._fp.seek(self._end + offset, os.SEEK_SET)
        else:
            raise NotImplementedError
        # Report the position in the same coordinate system as tell().
        return newPos - self._begin

    def tell(self):
        '''Return the current position relative to the view's beginning.'''

        return self._fp.tell() - self._begin

    def read(self, size = -1):
        '''
        Read up to `size` units from the view, never going past the frozen end.

        If size is None or negative, everything up to the frozen end is read. If the underlying file object is
        positioned before `begin`, it is moved to `begin` first; if it is at or past `end`, an empty result is returned.
        '''

        curPos = self._fp.tell()
        if curPos < self._begin:
            self._fp.seek(self._begin)
            curPos = self._begin
        elif curPos >= self._end:
            # read(0) yields an empty object of the type the underlying file produces.
            return self._fp.read(0)

        remaining = self._end - curPos
        if size is None or size < 0:
            return self._fp.read(remaining)
        return self._fp.read(min(size, remaining))


class DummyClientResponse:
'''A ClientResponse-like object for when no actual ClientResponse is available. Always evaluates to False when cast to a bool.'''



+ 3
- 1
qwarc/warc.py View File

@@ -231,10 +231,12 @@ class WARC:
fp.seek(0, io.SEEK_END)
length = fp.tell()
fp.seek(0)
# Work around https://github.com/webrecorder/warcio/issues/90
payload = qwarc.utils.FrozenFileView(fp, 0, length)
record = self._warcWriter.create_warc_record(
f'file://{self._logFilename}',
'resource',
payload = fp,
payload = payload,
length = length,
warc_headers_dict = {'X-QWARC-Type': 'log', 'Content-Type': 'text/plain; charset=utf-8', 'WARC-Warcinfo-ID': self._metaWarcinfoRecordID},
)


Loading…
Cancel
Save