Based on an enormous amount of work by contributors to Project Gutenberg and the Distributed Proofreaders, combined with healthy sourcing of the US Copyright Office’s records, Google has compiled a list of works originally copyrighted between 1923 and 1963 which have been renewed at some point, the upshot being that if a given work published in that time span is not on the list, it’s likely in the public domain.
One problem with the list is that the database is a 370+ megabyte XML file, which is hard to load up in an XML-aware editor and even caused eXist to choke. So I broke it up into chunks with a shortish Groovy script, for neat ingestion into an XML database. The heart of the script is a SAX handler that basically churns each record in the XML file into a Groovy object, and a closure (there’s that word again!) that handles each record as it is constructed. As written, the script simply breaks the big file into a bunch of files, one for each year (you will of course have to edit the paths). By supplying a different closure, you could do all sorts of different things with the records, e.g. stuff them into a relational database.
In the spirit of the thing, the script is in the public domain — but I make no representations as to the quality, idiomaticity or overall efficiency of the script; despite being SAX-based, it still manages to chew up quite a bit of memory, so watch out. Note that you will need Apache Commons Lang (say, version 2.4) on the classpath (e.g. in $HOME/.groovy/lib) for this script to work. Developed with Groovy 1.5.6.
I’ve tried to stop WordPress from ‘prettifying’ the output, which appears to mangle quotes. I hope to have that fixed soon …
import org.xml.sax.helpers.DefaultHandler
import org.xml.sax.Attributes
import org.xml.sax.helpers.XMLReaderFactory
import org.xml.sax.InputSource
import org.apache.commons.lang.StringEscapeUtils
import org.xml.sax.Locator
/**
 * Represents an individual <Record> element
 * in the document.
 *
 * Scalar fields are written out verbatim by {@link #xml()}, so whoever
 * populates them is responsible for XML-escaping the values (the SAX
 * handler in this file escapes element text before storing it).
 **/
class Record {
    def file
    def lines
    def recno
    def title
    def copyrightYear
    // Copyright date entries; each element must respond to xml()
    def copyrights = []
    def renewalYear
    // Renewal date entries; each element must respond to xml()
    def renewals = []
    // where it was published
    def published
    // rare!
    def note
    // source of the copyright info
    def source
    def snippet
    def md5sum
    // contributors, holders, and pseudonyms; each element must respond to xml()
    def people = []

    /**
     * Get the XML representing this element. Note
     * that proper functioning here depends on how the
     * handler builds the elements.
     *
     * Every scalar field that was set is emitted, including Recno and
     * Published. A field that is still null is left out instead of being
     * rendered as the literal text "null". Snippet, Note, Recno and
     * Published are additionally skipped when empty.
     *
     * @return a string containing this record's XML.
     */
    def xml() {
        def buf = new StringBuffer()
        buf << "\n<Record>\n"
        // "?: null" folds empty optional values into null so they are skipped.
        appendElement(buf, "Recno", recno ?: null)
        appendElement(buf, "Title", title)
        appendElement(buf, "File", file)
        appendElement(buf, "Lines", lines)
        appendElement(buf, "MD5Sum", md5sum)
        appendElement(buf, "Snippet", snippet ?: null)
        appendElement(buf, "Note", note ?: null)
        appendElement(buf, "Published", published ?: null)
        appendElement(buf, "Source", source)
        // Years are tested against null only, so a numeric 0 is still written.
        appendElement(buf, "CopyrightYear", copyrightYear)
        appendElement(buf, "RenewalYear", renewalYear)
        copyrights.each { buf << it.xml() }
        renewals.each { buf << it.xml() }
        people.each { buf << it.xml() }
        buf << "</Record>\n"
        return buf.toString()
    }

    /**
     * Appends one tab-indented <name>value</name> line to the buffer,
     * or nothing at all when the value is null.
     */
    private static void appendElement(StringBuffer buf, String name, Object value) {
        if (value != null) {
            buf << "\t<${name}>${value}</${name}>\n"
        }
    }
}
/**
 * An inelegant class representing the elements that denote
 * people (copyright holders, contributors, aliases, etc.)
 *
 * ELEMENTS maps each supported element name (the "role") to the names of
 * its two child elements: the first child carries {@code name}, the second
 * carries {@code type}.
 **/
class Person {
    static ELEMENTS = ["Holder" : [ "Name", "Type" ],
                       "Contrib" : [ "Name", "Role" ],
                       "Pseudonym" : [ "Pseudo", "Real" ],
                       "Neenym" : [ "Nee", "Now" ],
                       "Aka" : [ "Alias", "Real" ] ]
    // The set of element names that are treated as person elements.
    static ROLES = ELEMENTS.keySet()
    def role
    def name
    def honorific
    def type

    /**
     * Renders this person as an element named after {@code role}, with one
     * child element per line and an optional Hon child.
     *
     * Values are written verbatim; they are expected to be XML-escaped
     * already.
     *
     * @return a string containing this person's XML.
     * @throws IllegalStateException if {@code role} is not a key of ELEMENTS.
     */
    def xml() {
        def childNames = ELEMENTS[role]
        if (childNames == null) {
            // Fail with a readable message instead of a NullPointerException.
            throw new IllegalStateException(
                "Unknown person role '${role}'; expected one of ${ROLES}".toString())
        }
        def firstElement = childNames[0]
        def secondElement = childNames[1]
        def buf = new StringBuffer()
        buf << "\n<${role}>\n"
        buf << "\t<${firstElement}>${name}</${firstElement}>\n"
        // Terminate this line so Hon / the closing tag start on their own line.
        buf << "\t<${secondElement}>${type}</${secondElement}>\n"
        if ( honorific ) {
            buf << "\t<Hon>${honorific}</Hon>\n"
        }
        buf << "</${role}>\n"
        return buf.toString()
    }
}
/**
 * A single dated entry of a record: either a copyright
 * or a renewal, as named by {@code role}.
 */
class RecordDate {
    // Element names that are treated as date entries.
    static ELEMENTS = ["Copyright", "Renewal"]
    def role
    def date
    def id

    /**
     * Builds the element for this entry: a wrapper named after
     * {@code role} holding a Date child and an Id child, one per line.
     * No newline follows the closing tag.
     */
    def xml() {
        def dateElement = "<Date>${date}</Date>"
        def idElement = "<Id>${id}</Id>"
        return "<${role}>\n${dateElement}\n${idElement}\n</${role}>"
    }
}
/**
 * SAX handler that turns each <code>Record</code> element
 * into a <code>Record</code> domain object.
 *
 * A new Record is started when a Record element opens, child elements fill
 * it in as they close, and the finished object is handed to
 * {@code recordListener} when the Record element closes.
 **/
class RecordHandler extends DefaultHandler {
    /**
     * Stack of strings that represents the current
     * element context. Pushed in startElement and popped in endElement,
     * so its depth follows the nesting depth of the document.
     **/
    Stack context = new Stack()
    /**
     * the current record being built.
     **/
    Record currentRec
    /**
     * the current Person element being built.
     **/
    Person currentPerson
    /**
     * The current date information being collected.
     **/
    RecordDate currentRecDate
    /**
     * A closure which will be called as each record is
     * read in. It receives the completed Record as its only argument.
     **/
    def recordListener
    /**
     * a buffer to collect the current text, since SAX might
     * not report all contiguous chunks of text at once.
     **/
    StringBuilder currentText = new StringBuilder()
    /**
     * Locator supplied by the parser, used for line numbers in messages.
     * Stays null if the parser never calls setDocumentLocator.
     **/
    def locator

    /** Remembers the parser's locator so errors can report a line number. */
    @Override
    public void setDocumentLocator(Locator locator)
    {
        println "Got a locator: ${locator}"
        this.locator = locator
    }

    /**
     * Records the element on the context stack and starts a fresh
     * Record, Person or RecordDate when the element calls for one.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
    {
        context << localName
        switch( localName ) {
            case "Record":
                currentRec = new Record()
                break
            case Person.ROLES:
                currentPerson = new Person()
                currentPerson.role = localName
                break
            case RecordDate.ELEMENTS:
                currentRecDate = new RecordDate()
                currentRecDate.role = localName
                break
        }
    }

    /** Accumulates character data until the enclosing element ends. */
    @Override
    public void characters(char [] ch, int start, int len)
    {
        currentText.append(ch,start,len)
    }

    /**
     * Stores the trimmed, XML-escaped text collected so far into the object
     * that the closing element belongs to, then clears the text buffer and
     * pops the element off the context stack.
     *
     * An element name that is not handled below prints a message and
     * terminates the JVM with exit status 1.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
    {
        // Escaped here so the domain objects can write values out verbatim.
        String txt = StringEscapeUtils.escapeXml(currentText.toString().trim())
        switch(localName) {
            case Person.ROLES:
                currentRec.people << currentPerson
                break
            case ["Type", "Role", "Real", "Now"]:
                currentPerson.type = txt
                break
            case ["Name", "Pseudo", "Nee", "Alias"]:
                currentPerson.name = txt
                break
            case "Hon":
                currentPerson.honorific = txt
                break
            case "CopyrightYear":
                currentRec.copyrightYear = Integer.parseInt(txt)
                break
            case "Date":
                currentRecDate.date = txt
                break
            case "Id":
                currentRecDate.id = txt
                break
            case "Copyright":
                currentRec.copyrights << currentRecDate
                break
            case "RenewalYear":
                currentRec.renewalYear = Integer.parseInt(txt)
                break
            case "Renewal":
                currentRec.renewals << currentRecDate
                break
            case "Recno":
                currentRec.recno = txt
                break
            case "Source":
                currentRec.source = txt
                break
            case "Lines":
                currentRec.lines = txt
                break
            case "MD5Sum":
                currentRec.md5sum = txt
                break
            case "File":
                currentRec.file = txt
                break
            case "Snippet":
                currentRec.snippet = txt
                break
            case "Title":
                currentRec.title = txt
                break
            case "Published":
                currentRec.published = txt
                break
            case "Record":
                recordListener(currentRec)
                break
            case "Note":
                currentRec.note = txt
                break
            case "CopyrightRenewalRecords":
                break
            default:
                // locator may never have been supplied, hence the safe navigation.
                println "Unrecognized element '${localName}' at line ${locator?.lineNumber}"
                System.exit(1)
        }
        currentText.length = 0
        // Balance the push done in startElement; without this the stack
        // gains one entry per element for the whole document.
        if ( !context.empty() ) {
            context.pop()
        }
    }
}
// The big renewals file to be split up (edit this path to suit).
def file = new File("input-dir/google-renewals-20080624/google-renewals-20080624.xml")
/**
 * Record listener that routes every record to a per-year output file,
 * keyed on the record's CopyrightYear.
 *
 * The first record seen for a year creates that year's file, remembers its
 * writer in the supplied map and writes the opening root tag. Each record's
 * XML is then appended and flushed straight away.
 **/
def listenerBase = { Map streams, Record it ->
    def year = it.copyrightYear
    Writer writer
    if ( streams.containsKey(year) ) {
        writer = (Writer) streams[year]
    } else {
        def target = new File("/output/dir/copyright-${year}.xml")
        println "creating ${target.absolutePath}"
        writer = target.newWriter()
        streams[year] = writer
        writer.append("<CopyrightRenewalRecords>")
    }
    writer.append(it.xml())
    writer.flush()
}
// Wire the SAX reader to the handler; the handler passes every finished
// record to the listener, which shares the outputStreams map of writers.
def reader = XMLReaderFactory.createXMLReader()
def handler = new RecordHandler()
def outputStreams = [:]
handler.recordListener = listenerBase.curry(outputStreams)
reader.setContentHandler( handler )
try {
    // withInputStream closes the input file whether or not parsing succeeds.
    file.withInputStream { input ->
        reader.parse( new InputSource( input ) )
    }
} catch (Exception x) {
    x.printStackTrace()
    // The locator is still null if the failure happened before parsing began.
    println "Error at line ${handler.locator?.lineNumber}"
} finally {
    // The map holds Writers (from File.newWriter()), so the value must be
    // typed as Writer for this closure to be callable at all.
    outputStreams.each { year, Writer writer ->
        println "Closing ${year}"
        try {
            writer.append("</CopyrightRenewalRecords>")
            writer.flush()
        } finally {
            writer.close()
        }
    }
}