A framework for quick web archiving
Nelze vybrat více než 25 témat. Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

96 řádky
3.3 KiB

  1. import argparse
  2. import asyncio
  3. import importlib.util
  4. import logging
  5. import os.path
  6. import qwarc
  7. import qwarc.version
  8. import sys
  9. import time
  10. class Formatter(logging.Formatter):
  11. def format(self, record):
  12. if not hasattr(record, 'itemString'):
  13. if hasattr(record, 'itemType') and hasattr(record, 'itemValue'):
  14. record.itemString = f'{record.itemType}:{record.itemValue}'
  15. else:
  16. record.itemString = 'None'
  17. return super().format(record)
  18. def setup_logging(logFilename):
  19. rootLogger = logging.getLogger()
  20. rootLogger.handlers = []
  21. rootLogger.setLevel(logging.INFO)
  22. formatter = Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(itemString)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
  23. formatter.converter = time.gmtime
  24. fileHandler = logging.FileHandler(logFilename)
  25. fileHandler.setFormatter(formatter)
  26. rootLogger.addHandler(fileHandler)
  27. stderrHandler = logging.StreamHandler()
  28. stderrHandler.setFormatter(formatter)
  29. rootLogger.addHandler(stderrHandler)
  30. def check_files(specFilename, logFilename):
  31. success = True
  32. if not os.path.isfile(specFilename):
  33. print(f'Error: "{specFilename}" does not exist or is not a regular file', file = sys.stderr)
  34. success = False
  35. if os.path.exists(logFilename):
  36. print(f'Error: "{logFilename}" already exists', file = sys.stderr)
  37. success = False
  38. if os.path.exists('STOP'):
  39. print('Error: "STOP" exists', file = sys.stderr)
  40. success = False
  41. return success
  42. def main():
  43. parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
  44. parser.add_argument('--version', action = 'version', version = f'qwarc {qwarc.version.__version__}')
  45. parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
  46. parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
  47. parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
  48. parser.add_argument('--concurrency', type = int, default = 1)
  49. parser.add_argument('--memorylimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes memory is free; disable if 0', default = 0)
  50. parser.add_argument('--disklimit', metavar = 'LIMIT', type = int, help = 'pause when less than LIMIT bytes disk space is free; disable if 0', default = 0)
  51. parser.add_argument('--warcsplit', metavar = 'SIZE', type = int, help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
  52. parser.add_argument('--warcdedupe', action = 'store_true', help = 'enable deduplication of WARC records')
  53. parser.add_argument('specfile')
  54. args = parser.parse_args()
  55. if not check_files(args.specfile, args.log):
  56. sys.exit(1)
  57. setup_logging(args.log)
  58. spec = importlib.util.spec_from_file_location('spec', args.specfile)
  59. specMod = importlib.util.module_from_spec(spec)
  60. spec.loader.exec_module(specMod)
  61. a = qwarc.QWARC(
  62. itemClasses = qwarc.Item.__subclasses__(),
  63. warcBasePath = args.warc,
  64. dbPath = args.database,
  65. concurrency = args.concurrency,
  66. memoryLimit = args.memorylimit,
  67. minFreeDisk = args.disklimit,
  68. warcSizeLimit = args.warcsplit,
  69. warcDedupe = args.warcdedupe,
  70. )
  71. if not os.path.exists(args.database):
  72. a.create_db()
  73. loop = asyncio.get_event_loop()
  74. try:
  75. loop.run_until_complete(a.run(loop))
  76. except (Exception, KeyboardInterrupt) as e:
  77. logging.exception('Unhandled error')
  78. loop.close()