Fun With Copyright Renewal Records
Based on an enormous amount of work by contributors to Project Gutenberg and the Distributed Proofreaders, combined with healthy sourcing of the US copyright office’s records, Google has compiled a list of works originally copyrighted between 1923 and 1963 which have been renewed at some point, the upshot being that if a given work published in that time span is not on the list, it’s likely in the public domain.
One problem with the list is that the database is a 370+ megabyte XML file, which is hard to load up in an XML-aware editor and even caused eXist to choke. So I broke it up into chunks with a shortish Groovy script, for neat ingestion into an XML database. The heart of the script is a SAX handler that basically churns each record in the XML file into a Groovy object, and a closure (there’s that word again!) that handles each record as it is constructed. As written, the script simply breaks the big file into a bunch of files, one for each year (you will of course have to edit the paths). By supplying a different closure, you could do all sorts of different things with the records, e.g. stuff them into a relational database.
In the spirit of the thing, the script is in the public domain — but I make no representations as to the quality, idiomaticity or overall efficiency of the script; despite being SAX-based, it still manages to chew up quite a bit of memory, so watch out. Note that you will need Apache Commons Lang (say, version 2.4) on the classpath (e.g. in $HOME/.groovy/lib) for this script to work. Developed with Groovy 1.5.6.
I’ve tried to stop WordPress from ‘prettifying’ the output, which appears to mangle quotes. I hope to have that fixed soon …
import org.xml.sax.helpers.DefaultHandler
import org.xml.sax.Attributes
import org.xml.sax.helpers.XMLReaderFactory
import org.xml.sax.InputSource
import org.apache.commons.lang.StringEscapeUtils
import org.xml.sax.Locator
/**
 * Represents an individual <Record> element
 * in the document.
 *
 * Every property is untyped and holds whatever the code that builds the
 * record assigns to it; xml() interpolates the values verbatim.
 **/
class Record {
    def file
    def lines
    def recno
    def title
    def copyrightYear
    // objects exposing an xml() method, rendered in order by xml()
    def copyrights = []
    def renewalYear
    // objects exposing an xml() method, rendered in order by xml()
    def renewals = []
    // where it was published
    def published
    // rare!
    def note
    // source of the copyright info
    def source
    def snippet
    def md5sum
    // contributors, holders, and pseudonyms
    def people = []

    /**
     * Get the XML representing this element. Note
     * that proper functioning here depends on how the
     * handler builds the elements: values are written as-is, with
     * no escaping performed here.
     *
     * Properties that were never set (null) are left out instead of
     * being written as the literal text "null"; Snippet and Note are
     * additionally left out when empty. Recno and Published are
     * written when present so they are not lost in the output.
     *
     * @return a string containing this record's XML.
     */
    def xml() {
        def buf = new StringBuffer()
        buf << "\n<Record>\n"
        appendElement(buf, "Recno", recno)
        appendElement(buf, "Title", title)
        appendElement(buf, "File", file)
        appendElement(buf, "Lines", lines)
        appendElement(buf, "MD5Sum", md5sum)
        if (snippet) {
            buf << "\t<Snippet>${snippet}</Snippet>\n"
        }
        if (note) {
            buf << "\t<Note>${note}</Note>\n"
        }
        appendElement(buf, "Published", published)
        appendElement(buf, "Source", source)
        appendElement(buf, "CopyrightYear", copyrightYear)
        appendElement(buf, "RenewalYear", renewalYear)
        // nested elements render themselves
        copyrights.each() {
            buf << it.xml()
        }
        renewals.each() {
            buf << it.xml()
        }
        people.each() {
            buf << it.xml()
        }
        buf << "</Record>\n"
        return buf.toString()
    }

    /**
     * Appends <name>value</name> followed by a newline to buf, or
     * nothing at all when value is null.
     */
    private static void appendElement(StringBuffer buf, String name, value) {
        if (value != null) {
            buf << "<${name}>${value}</${name}>\n"
        }
    }
}
/**
 * An inelegant class representing the elements that denote
 * people (copyright holders, contributors, aliases, etc.)
 **/
class Person {
    /**
     * Maps each person-like element name to the names of its two child
     * elements: the first child carries {@code name}, the second
     * carries {@code type}.
     */
    static ELEMENTS = ["Holder" : [ "Name", "Type" ],
                       "Contrib" : [ "Name", "Role" ],
                       "Pseudonym" : [ "Pseudo", "Real" ],
                       "Neenym" : [ "Nee", "Now" ],
                       "Aka" : [ "Alias", "Real" ] ]
    /** The element names that denote a person. */
    static ROLES = ELEMENTS.keySet()
    // the enclosing element name; must be one of ROLES for xml() to work
    def role
    // text written into the first child element
    def name
    // text written into the optional <Hon> element
    def honorific
    // text written into the second child element
    def type

    /**
     * Renders this person as XML. Values are written as-is, without
     * escaping. A child whose value was never set (null) is omitted
     * rather than written as the literal text "null", and every child
     * element ends with a line break.
     *
     * @return a string containing this element's XML.
     */
    def xml() {
        def firstElement = ELEMENTS[role][0]
        def secondElement = ELEMENTS[role][1]
        def buf = new StringBuffer()
        buf << "\n<${role}>\n"
        if ( name != null ) {
            buf << "<${firstElement}>${name}</${firstElement}>\n"
        }
        if ( type != null ) {
            buf << "<${secondElement}>${type}</${secondElement}>\n"
        }
        if ( honorific ) {
            buf << "\t<Hon>${honorific}</Hon>\n"
        }
        buf << "</${role}>\n"
        return buf.toString()
    }
}
/**
 * A dated entry of a record: either a copyright or a renewal.
 */
class RecordDate {
    /** Names of the elements this class stands for. */
    static ELEMENTS = ["Copyright", "Renewal"]
    // name of the wrapping element
    def role
    // content of the <Date> child
    def date
    // content of the <Id> child
    def id

    /**
     * Renders the entry as a role-named element wrapping a Date and an
     * Id child. Values are interpolated verbatim, and no line break
     * follows the closing tag.
     */
    def xml() {
        def open = "<${role}>"
        def close = "</${role}>"
        return "${open}\n<Date>${date}</Date>\n<Id>${id}</Id>\n${close}"
    }
}
/**
 * SAX handler that turns each <code>Record</code> element
 * into a <code>Record</code> domain object.
 *
 * Text is accumulated in {@code characters} and assigned to the object
 * under construction when the owning element ends. The text is trimmed
 * and XML-escaped at that point, so the domain objects hold
 * ready-to-emit values.
 **/
class RecordHandler extends DefaultHandler {
    /**
     * Stack of strings that represents the current
     * element context. Pushed in startElement and popped in endElement,
     * so its depth follows the nesting of the element being processed.
     **/
    Stack context = new Stack()
    /**
     * the current record being built.
     **/
    Record currentRec
    /**
     * the current Person element being built.
     **/
    Person currentPerson
    /**
     * The current date information being collected.
     **/
    RecordDate currentRecDate
    /**
     * A closure which will be called as each record is
     * read in. It receives the completed Record as its only argument.
     **/
    def recordListener
    /**
     * a buffer to collect the current text, since SAX might
     * not report all contiguous chunks of text at once.
     **/
    StringBuilder currentText = new StringBuilder()
    /**
     * Locator handed over by the parser, used for line numbers in
     * messages. Stays null if the parser never supplies one.
     **/
    def locator

    @Override
    public void setDocumentLocator(Locator locator)
    {
        println "Got a locator: ${locator}"
        this.locator = locator
    }

    /**
     * Records the element on the context stack and starts a fresh
     * domain object for Record, person and date elements.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
    {
        context << localName
        switch( localName ) {
            case "Record":
                currentRec = new Record()
                break
            case Person.ROLES:
                currentPerson = new Person()
                currentPerson.role = localName
                break
            case RecordDate.ELEMENTS:
                currentRecDate = new RecordDate()
                currentRecDate.role = localName
                break
        }
    }

    @Override
    public void characters(char [] ch, int start, int len)
    {
        currentText.append(ch,start,len)
    }

    /**
     * Stores the text collected for the element that just ended on the
     * object under construction, attaches finished people and dates to
     * the current record, and hands a finished record to
     * recordListener. An unknown element name prints a message and
     * terminates the JVM.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
    {
        String txt = StringEscapeUtils.escapeXml(currentText.toString().trim())
        switch(localName) {
            case Person.ROLES:
                currentRec.people << currentPerson
                break
            case ["Type", "Role", "Real", "Now"]:
                currentPerson.type = txt
                break
            case ["Name", "Pseudo", "Nee", "Alias"]:
                currentPerson.name = txt
                break
            case "Hon":
                currentPerson.honorific = txt
                break
            case "CopyrightYear":
                currentRec.copyrightYear = Integer.parseInt(txt)
                break
            case "Date":
                currentRecDate.date = txt
                break
            case "Id":
                currentRecDate.id = txt
                break
            case "Copyright":
                currentRec.copyrights << currentRecDate
                break
            case "RenewalYear":
                currentRec.renewalYear = Integer.parseInt(txt)
                break
            case "Renewal":
                currentRec.renewals << currentRecDate
                break
            case "Recno":
                currentRec.recno = txt
                break
            case "Source":
                currentRec.source = txt
                break
            case "Lines":
                currentRec.lines = txt
                break
            case "MD5Sum":
                currentRec.md5sum = txt
                break
            case "File":
                currentRec.file = txt
                break
            case "Snippet":
                currentRec.snippet = txt
                break
            case "Title":
                currentRec.title = txt
                break
            case "Published":
                currentRec.published = txt
                break
            case "Record":
                recordListener(currentRec)
                break
            case "Note":
                currentRec.note = txt
                break
            case "CopyrightRenewalRecords":
                break
            default:
                // locator may be null if the parser never supplied one
                println "Unrecognized element '${localName}' at line ${locator?.lineNumber}"
                System.exit(1)
        }
        currentText.length = 0
        // Undo the push from startElement; without this the stack gains
        // one entry per element and grows for the whole document.
        if ( !context.empty() ) {
            context.pop()
        }
    }
}
// The big renewals file to split; edit this path for your environment.
def file = new File("input-dir/google-renewals-20080624/google-renewals-20080624.xml")
/**
 * A listener that will output each record into a different stream depending
 * on the CopyrightYear of the record. The first record seen for a year
 * creates that year's file and writes the opening root tag; the writer
 * is kept in the streams map so later records append to the same file.
 **/
def listenerBase = { Map streams, Record it ->
    if ( !streams.containsKey(it.copyrightYear) ) {
        // edit this output path for your environment
        def f = new File("/output/dir/copyright-${it.copyrightYear}.xml")
        println "creating ${f.absolutePath}"
        def stream = f.newWriter()
        streams[it.copyrightYear] = stream
        stream.append("<CopyrightRenewalRecords>")
    }
    Writer s = (Writer)streams[it.copyrightYear]
    s.append(it.xml())
    s.flush()
}
def reader = XMLReaderFactory.createXMLReader()
def handler = new RecordHandler()
// copyright year -> Writer for that year's output file
def outputStreams = [:]
handler.recordListener = listenerBase.curry(outputStreams)
reader.setContentHandler( handler )
def input = null
try {
    input = file.newInputStream()
    reader.parse( new InputSource( input ) )
} catch (Exception x) {
    x.printStackTrace()
    // the locator is still null if the failure happened before parsing began
    println "Error at line ${handler.locator?.lineNumber}"
} finally {
    // release the input file whether or not parsing succeeded
    input?.close()
}
// The map holds Writers (from File.newWriter()), so the closure must take
// a Writer; this is where each file gets its closing root tag.
outputStreams.each() {
    k, Writer v ->
    println "Closing ${k}"
    v.append("</CopyrightRenewalRecords>")
    v.flush()
    v.close()
}
Posted: July 1st, 2008 under RDF, Tools, conferences, nerdination.
Write a comment
You need to login to post comments!