瀏覽代碼

Write meta WARC with log file

tags/v0.2.0
JustAnotherArchivist 4 年之前
父節點
當前提交
e99e2304c9
共有 3 個檔案被更改,包括 71 行新增14 行删除
  1. +2
    -13
      qwarc/cli.py
  2. +15
    -0
      qwarc/utils.py
  3. +54
    -1
      qwarc/warc.py

+ 2
- 13
qwarc/cli.py 查看文件

@@ -4,19 +4,9 @@ import importlib.util
import logging
import os.path
import qwarc
import qwarc.utils
import qwarc.version
import sys
import time


class Formatter(logging.Formatter):
def format(self, record):
if not hasattr(record, 'itemString'):
if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
record.itemString = f'{record.itemType}:{record.itemValue}'
else:
record.itemString = 'None'
return super().format(record)


def setup_logging(logFilename):
@@ -24,8 +14,7 @@ def setup_logging(logFilename):
rootLogger.handlers = []
rootLogger.setLevel(logging.INFO)

formatter = Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
formatter.converter = time.gmtime
formatter = qwarc.utils.LogFormatter()

fileHandler = logging.FileHandler(logFilename)
fileHandler.setFormatter(formatter)


+ 15
- 0
qwarc/utils.py 查看文件

@@ -6,6 +6,7 @@ import logging
import os
import pkg_resources
import platform
import time


PAGESIZE = os.sysconf('SC_PAGE_SIZE')
@@ -215,3 +216,17 @@ def get_software_info():
},
'self': [{"package": package, "version": version} for package, version in _get_dependency_versions(__package__)],
}


class LogFormatter(logging.Formatter):
def __init__(self):
super().__init__('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
self.converter = time.gmtime

def format(self, record):
if not hasattr(record, 'itemString'):
if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
record.itemString = f'{record.itemType}:{record.itemValue}'
else:
record.itemString = 'None'
return super().format(record)

+ 54
- 1
qwarc/warc.py 查看文件

@@ -1,8 +1,11 @@
import fcntl
import gzip
import io
import json
import logging
import os
import qwarc.utils
import tempfile
import time
import warcio

@@ -28,6 +31,19 @@ class WARC:
self._dedupe = dedupe
self._dedupeMap = {}

self._logFile = None
self._logHandler = None
self._setup_logger()

def _setup_logger(self):
rootLogger = logging.getLogger()
formatter = qwarc.utils.LogFormatter()
self._logFile = tempfile.NamedTemporaryFile(prefix = 'qwarc-warc-', suffix = '.log.gz', delete = False)
self._logHandler = logging.StreamHandler(io.TextIOWrapper(gzip.GzipFile(filename = self._logFile.name, mode = 'wb'), encoding = 'utf-8'))
self._logHandler.setFormatter(formatter)
rootLogger.addHandler(self._logHandler)
self._logHandler.setLevel(logging.INFO)

def _ensure_opened(self):
'''Open the next file that doesn't exist yet if there is currently no file opened'''

@@ -116,7 +132,7 @@ class WARC:
if self._maxFileSize and self._file.tell() > self._maxFileSize:
self.close()

def close(self):
def _close_file(self):
'''Close the currently opened WARC'''

if not self._closed:
@@ -124,3 +140,40 @@ class WARC:
self._warcWriter = None
self._file = None
self._closed = True

def _write_meta_warc(self):
filename = f'{self._prefix}-meta.warc.gz'
#TODO: Handle OSError on fcntl.flock and retry
self._file = open(filename, 'ab')
try:
fcntl.flock(self._file.fileno(), fcntl.LOCK_EX)
logging.info(f'Opened {filename}')
self._warcWriter = warcio.warcwriter.WARCWriter(self._file, gzip = True)
self._closed = False

self.write_warcinfo_record()

self._logHandler.flush()
self._logHandler.stream.close()
record = self._warcWriter.create_warc_record(
'urn:qwarc:log',
'resource',
payload = gzip.GzipFile(self._logFile.name),
warc_headers_dict = {'Content-Type': 'text/plain; charset=utf-8'},
)
self._warcWriter.write_record(record)
finally:
self._close_file()

def close(self):
'''Clean up everything.'''
self._close_file()
self._write_meta_warc()
logging.getLogger().removeHandler(self._logHandler)
try:
os.remove(self._logFile.name)
except OSError:
logging.error('Could not remove temporary log file')
self._logFile = None
self._logHandler.close()
self._logHandler = None

Loading…
取消
儲存