#!/usr/bin/env python3
# Origin: commit 0198b221a838cbd2779daf887f0b9d885e045a3d
# ("Add ia-files-xml-to-jsonl"), JustAnotherArchivist, Wed, 2 Jun 2021 21:46:39 +0000.
"""Convert a ``<files>`` XML document on stdin into JSON Lines on stdout.

The input must have a ``<files>`` root whose direct children are all
``<file>`` elements.  Every ``<file>`` becomes one JSON object per output
line: its XML attributes come first, followed by one key per child element
(tag name -> element text, ``null`` for an empty element).

NOTE(review): the script name suggests the input is an Internet Archive
``*_files.xml`` listing; nothing in the code depends on that.
"""
import json
import sys
import xml.etree.ElementTree


def file_element_to_record(element):
    """Flatten one ``<file>`` element into a plain dict.

    Args:
        element: an ``xml.etree.ElementTree.Element`` with tag ``file``.

    Returns:
        A dict holding the element's attributes (in attribute order) followed
        by its children as ``tag -> text`` (in document order).

    Raises:
        ValueError: if the tag is not ``file``, if two children share a tag,
            or if a child tag collides with an attribute name.  Any of these
            would make the dict merge silently drop data.
    """
    if element.tag != 'file':
        raise ValueError(f'unexpected element {element.tag!r}, expected \'file\'')

    attributes = element.attrib

    # Explicit exceptions rather than ``assert``: the checks guard against
    # silent data loss and must survive ``python -O``.
    children = {}
    for child in element:
        if child.tag in children:
            raise ValueError(f'duplicate children: {child.tag!r}')
        children[child.tag] = child.text

    # One intersection test covers both directions of the collision check.
    clashing = [key for key in attributes if key in children]
    if clashing:
        raise ValueError(f'attribute found in children: {clashing!r}')

    return {**attributes, **children}


def iter_records(xml_source):
    """Yield one record dict per ``<file>`` child of the ``<files>`` root.

    Args:
        xml_source: the complete XML document as ``bytes`` or ``str``.
            Passing ``bytes`` lets the parser honor the document's own
            encoding declaration.

    Yields:
        The dict produced by :func:`file_element_to_record` for each child,
        lazily, so records preceding a malformed entry are still produced.

    Raises:
        xml.etree.ElementTree.ParseError: if the document is not well-formed.
        ValueError: if the root is not ``files`` or a child is invalid.
    """
    root = xml.etree.ElementTree.fromstring(xml_source)
    if root.tag != 'files':
        raise ValueError(f'unexpected root element {root.tag!r}, expected \'files\'')
    for element in root:
        yield file_element_to_record(element)


def main():
    """Read the XML document from stdin and print one JSON object per line."""
    # Read raw bytes: decoding is left to the XML parser instead of the
    # locale-dependent text layer of ``sys.stdin``.
    for record in iter_records(sys.stdin.buffer.read()):
        print(json.dumps(record))


if __name__ == '__main__':
    main()