Procházet zdrojové kódy

Use base64-encoded headers.

master
Alard před 11 roky
rodič
revize
01fc97cde7
Změněny 2 soubory: 22 přidání a 6 odebrání
  1. +13
    -4
      megawarc
  2. +9
    -2
      megawarc-fix

+ 13
- 4
megawarc Zobrazit soubor

@@ -37,9 +37,12 @@ One line with a JSON object per file in the .tar.
"header_fields": { "header_fields": {
... (parsed fields from the tar header) ... (parsed fields from the tar header)
}, },
"header_string": string (the tar header for this entry)
"header_base64": string (the base64-encoded tar header)
} }


In older megawarcs the header is sometimes not base64-encoded:
"header_string": string (the tar header for this entry)



USAGE USAGE
----- -----
@@ -57,6 +60,7 @@ megawarc restore FILE
It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE. It reads FILE.warc.gz, FILE.tar and FILE.json.gz to make FILE.
""" """


import base64
import gzip import gzip
import json import json
import os.path import os.path
@@ -219,7 +223,7 @@ class MegawarcBuilder(object):
d["target"] = d_target d["target"] = d_target
d["src_offsets"] = d_src_offsets d["src_offsets"] = d_src_offsets
d["header_fields"] = entry.get_info("utf-8", {}) d["header_fields"] = entry.get_info("utf-8", {})
d["header_string"] = tar_header
d["header_base64"] = base64.b64encode(tar_header)


# store metadata # store metadata
json.dump(d, json_out, separators=(',', ':')) json.dump(d, json_out, separators=(',', ':'))
@@ -338,7 +342,7 @@ class MegawarcPacker(object):
d["target"] = d_target d["target"] = d_target
d["src_offsets"] = d_src_offsets d["src_offsets"] = d_src_offsets
d["header_fields"] = entry.get_info("utf-8", {}) d["header_fields"] = entry.get_info("utf-8", {})
d["header_string"] = tar_header
d["header_base64"] = base64.b64encode(tar_header)


# store metadata # store metadata
json.dump(d, json_out, separators=(',', ':')) json.dump(d, json_out, separators=(',', ':'))
@@ -371,7 +375,12 @@ class MegawarcRestorer(object):
if entry["target"]["container"] == "warc": if entry["target"]["container"] == "warc":
if self.verbose: if self.verbose:
print >>sys.stderr, "Copying %s from warc" % entry["header_fields"]["name"] print >>sys.stderr, "Copying %s from warc" % entry["header_fields"]["name"]
tar_out.write(entry["header_string"])
if "header_base64" in entry:
tar_out.write(base64.b64decode(entry["header_base64"]))
elif "header_string" in entry:
tar_out.write(entry["header_string"])
else:
raise Exception("Missing header_string or header_base64.")
copy_to_stream(tar_out, self.input_warc_filename, copy_to_stream(tar_out, self.input_warc_filename,
entry["target"]["offset"], entry["target"]["size"]) entry["target"]["offset"], entry["target"]["size"])
padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE


+ 9
- 2
megawarc-fix Zobrazit soubor

@@ -9,6 +9,7 @@
# ./megawarc-fix BASENAME # ./megawarc-fix BASENAME
# where BASENAME is the part before .megawarc.(warc.gz|json.gz|tar) # where BASENAME is the part before .megawarc.(warc.gz|json.gz|tar)
# #
import base64
import gzip import gzip
import json import json
import os.path import os.path
@@ -161,7 +162,10 @@ class MegawarcFixer(object):
block_size = (tarfile.BLOCKSIZE + # header block_size = (tarfile.BLOCKSIZE + # header
entry["target"]["size"] + # data entry["target"]["size"] + # data
(tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE) (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE)
tar_out.write(entry["header_string"])
if "header_base64" in entry:
tar_out.write(base64.b64decode(entry["header_base64"]))
elif "header_string" in entry:
tar_out.write(entry["header_string"])
copy_to_stream(tar_out, self.input_warc_filename, copy_to_stream(tar_out, self.input_warc_filename,
entry["target"]["offset"], entry["target"]["size"]) entry["target"]["offset"], entry["target"]["size"])
padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE padding = (tarfile.BLOCKSIZE - entry["target"]["size"]) % tarfile.BLOCKSIZE
@@ -192,7 +196,10 @@ class MegawarcFixer(object):
d["target"] = d_target d["target"] = d_target
d["src_offsets"] = entry["src_offsets"] d["src_offsets"] = entry["src_offsets"]
d["header_fields"] = entry["header_fields"] d["header_fields"] = entry["header_fields"]
d["header_string"] = entry["header_string"]
if "header_base64" in entry:
d["header_base64"] = entry["header_base64"]
elif "header_string" in entry:
d["header_base64"] = base64.b64encode(entry["header_string"])


json.dump(d, json_out, separators=(',', ':')) json.dump(d, json_out, separators=(',', ':'))
json_out.write("\n") json_out.write("\n")


Načítá se…
Zrušit
Uložit