Site menu:

Tags

Recent Comments

Site search

Categories

July 2008
M T W T F S S
« Apr   Oct »
 123456
78910111213
14151617181920
21222324252627
28293031  

Tags

Blogroll

Fun With Copyright Renewal Records

Based on an enormous amount of work by contributors to Project Gutenberg and the Distributed Proofreaders, combined with healthy sourcing of the US copyright office’s records, Google has compiled a list of works originally copyrighted between 1923 and 1963 which have been renewed at some point, the upshot being that if a given work published in that time span is not on the list, it’s likely in the public domain.

One problem with the list is that the database is a 370+ megabyte XML file, which is hard to load up in an XML-aware editor and even caused eXist to choke.  So I broke it up into chunks with a shortish Groovy script, for neat ingestion into an XML database.  The heart of the script is a SAX handler that basically churns each record in the XML file into a Groovy object, and a closure (there’s that word again!) that handles each record as it is constructed.  As written, the script simply breaks the big file into a bunch of files, one for each year (you will of course have to edit the paths).  By supplying a different closure, you could do all sorts of different things with the records, e.g. stuff them into a relational database.

In the spirit of the thing, the script is in the public domain — but I make no representations as to the quality, idiomaticity or overall efficiency of the script; despite being SAX-based, it still manages to chew up quite a bit of memory, so watch out.  Note that you will need Apache Commons Lang (say, version 2.4) on the classpath (e.g. in $HOME/.groovy/lib) for this script to work. Developed with Groovy 1.5.6.

I’ve tried to stop WordPress from ‘prettifying’ the output, which appears to mangle quotes. I hope to have that fixed soon …

import org.xml.sax.helpers.DefaultHandler
import org.xml.sax.Attributes
import org.xml.sax.helpers.XMLReaderFactory
import org.xml.sax.InputSource

import org.apache.commons.lang.StringEscapeUtils
import org.xml.sax.Locator

/**
 * Represents an individual <Record> element
 * in the document.
 *
 * All fields are plain untyped properties; they are interpolated
 * verbatim by {@link #xml()}, so any XML escaping has to be applied
 * before the values are stored here.
 **/
class Record {
    def file

    def lines

    // record number
    def recno

    def title

    def copyrightYear

    // copyright date entries; each element must respond to xml()
    def copyrights = []

    def renewalYear

    // renewal date entries; each element must respond to xml()
    def renewals = []

    // where it was published
    def published

    // rare!
    def note

    // source of the copyright info
    def source
    def snippet
    def md5sum

    // contributors, holders, and pseudonyms; each element must respond to xml()
    def people = []

    /**
     * Get the XML representing this element.  Note
     * that proper functioning here depends on how the
     * handler builds the elements: values are written out
     * exactly as stored, without any further escaping.
     *
     * Optional values (Recno, Published, Snippet, Note) are only
     * written when they are set, so nothing that was collected for
     * the record is dropped from the output.
     *
     * @return a string containing this record's XML.
     */
    def xml() {
        def buf = new StringBuffer()
        buf << """
<Record>
    <Title>${title}</Title>
    <File>${file}</File>
    <Lines>${lines}</Lines>
    <MD5Sum>${md5sum}</MD5Sum>
"""
        // Recno and Published are collected for the record, so they
        // are emitted here rather than silently lost.
        if (recno) {
            buf << "\t<Recno>${recno}</Recno>\n"
        }
        if (published) {
            buf << "\t<Published>${published}</Published>\n"
        }
        if (snippet) {
            buf << "\t<Snippet>${snippet}</Snippet>\n"
        }
        if (note) {
            buf << "\t<Note>${note}</Note>\n"
        }
        buf <<
"""
    <Source>${source}</Source>
    <CopyrightYear>${copyrightYear}</CopyrightYear>
    <RenewalYear>${renewalYear}</RenewalYear>
"""
        // nested elements render themselves
        copyrights.each() {
            buf << it.xml()
        }
        renewals.each() {
            buf << it.xml()
        }
        people.each() {
            buf << it.xml()
        }
        buf << "</Record>\n"
        return buf.toString()
    }
}

/**
 * An inelegant class representing the elements that denote
 * people (copyright holders, contributors, aliases, etc.)
 *
 * ELEMENTS maps each supported role (the wrapping element name) to
 * the names of its two child elements: the first holds {@code name},
 * the second holds {@code type}.
 **/
class Person {

    static ELEMENTS = ["Holder" :   [ "Name", "Type" ],
                        "Contrib" : [ "Name", "Role" ],
                        "Pseudonym" : [ "Pseudo", "Real" ],
                        "Neenym" : [ "Nee", "Now" ],
                        "Aka" : [ "Alias", "Real" ] ]

    // the element names that denote a person
    static ROLES = ELEMENTS.keySet()

    // one of ROLES; selects the element names used by xml()
    def role

    def name

    // optional; only written when set
    def honorific

    def type

    /**
     * Get the XML representing this person.  Values are written
     * exactly as stored, without any further escaping.
     *
     * Every child element is placed on its own line and the output
     * ends with a newline after the closing role tag.
     *
     * @return a string containing this person's XML.
     */
    def xml() {
        def firstElement = ELEMENTS[role][0]
        def secondElement = ELEMENTS[role][1]
        def buf = new StringBuffer()

        // the template ends with a newline so that whatever follows
        // (<Hon> or the closing tag) starts on a fresh line
        buf << """
<${role}>
    <${firstElement}>${name}</${firstElement}>
    <${secondElement}>${type}</${secondElement}>
"""
        if ( honorific ) {
            buf << "    <Hon>${honorific}</Hon>\n"
        }
        buf << "</${role}>\n"
        return buf.toString()
    }
}

/**
 * A copyright or renewal date entry: the wrapping element name
 * (role) together with its Date and Id values.
 */
class RecordDate {

    // element names that are represented by this class
    static ELEMENTS = ["Copyright", "Renewal"]

    def role
    def date
    def id

    /**
     * Render this entry as XML: the role element wrapping a Date and
     * an Id child, one per line, with no trailing newline.
     */
    def xml() {
        def pieces = [
            "<${role}>",
            "    <Date>${date}</Date>",
            "    <Id>${id}</Id>",
            "</${role}>"
        ]
        return pieces.join("\n")
    }
}

/**
 * SAX handler that turns each <code>Record</code> element
 * into a <code>Record</code> domain object.
 *
 * Text is accumulated in {@code currentText} and assigned to the
 * object under construction when the enclosing element ends; a
 * finished record is handed to {@code recordListener}.
 **/
class RecordHandler extends DefaultHandler {

    /**
     * Stack of strings that represents the current
     * element context.  Pushed in startElement and popped in
     * endElement, so its depth matches the current nesting depth.
     **/
    Stack context = new Stack()

    /**
     * the current record being built.
     **/
    Record currentRec

    /**
     * the current Person element being built.
     **/
    Person currentPerson

    /**
     * The current date information being collected.
     **/
    RecordDate currentRecDate

    /**
     * A closure which will be called as each record is
     * read in.
     **/
    def recordListener

    /**
     * a buffer to collect the current text, since SAX might
     * not report all contiguous chunks of text at once.
     **/
    StringBuilder currentText = new StringBuilder()

    /**
     * Locator supplied by the parser (if any); used for line
     * numbers in diagnostics.
     **/
    def locator

    @Override
    public void setDocumentLocator(Locator locator)
    {    println "Got a locator: ${locator}"
        this.locator = locator
    }

    /**
     * Records the element on the context stack and starts a new
     * domain object when the element is a record, a person or a
     * copyright/renewal date.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
    {
        context << localName
        switch( localName ) {
            case "Record":
                currentRec = new Record()
                break
            case Person.ROLES:
                currentPerson = new Person()
                currentPerson.role = localName
                break
            case RecordDate.ELEMENTS:
                currentRecDate = new RecordDate()
                currentRecDate.role = localName
                break
        }
    }

    @Override
    public void characters(char [] ch, int start, int len)
    {
        currentText.append(ch,start,len)
    }

    /**
     * Stores the trimmed, XML-escaped text collected for the element
     * on the object currently being built, or finishes that object.
     * An element name that is not handled prints a message and
     * terminates the JVM.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
    {
        // escaped here so that the domain objects can interpolate
        // their values straight into XML
        String txt = StringEscapeUtils.escapeXml(currentText.toString().trim())
        switch(localName) {
            case Person.ROLES:
                currentRec.people << currentPerson
                break
            case ["Type", "Role", "Real", "Now"]:
                currentPerson.type = txt
                break
            case ["Name", "Pseudo", "Nee", "Alias"]:
                currentPerson.name = txt
                break
            case "Hon":
                currentPerson.honorific = txt
                break
            case "CopyrightYear":
                currentRec.copyrightYear = Integer.parseInt(txt)
                break
            case "Date":
                currentRecDate.date = txt
                break
            case "Id":
                currentRecDate.id = txt
                break
            case "Copyright":
                currentRec.copyrights << currentRecDate
                break
            case "RenewalYear":
                currentRec.renewalYear = Integer.parseInt(txt)
                break
            case "Renewal":
                currentRec.renewals << currentRecDate
                break
            case "Recno":
                currentRec.recno = txt
                break
            case "Source":
                currentRec.source = txt
                break
            case "Lines":
                currentRec.lines = txt
                break
            case "MD5Sum":
                currentRec.md5sum = txt
                break
            case "File":
                currentRec.file = txt
                break
            case "Snippet":
                currentRec.snippet = txt
                break
            case "Title":
                currentRec.title = txt
                break
            case "Published":
                currentRec.published = txt
                break
            case "Record":
                recordListener(currentRec)
                break
            case "Note":
                currentRec.note = txt
                break
            case "CopyrightRenewalRecords":
                break
            default:
                // the parser is not obliged to supply a locator
                println "Unrecognized element '${localName}' at line ${locator?.lineNumber}"
                System.exit(1)
        }
        // leave this element's context; without the pop the stack
        // would keep one entry for every element in the document
        if ( !context.empty() ) {
            context.pop()
        }
        currentText.length = 0
    }

}

// the big renewal-records file to split (edit the path to suit)
def file = new File("input-dir/google-renewals-20080624/google-renewals-20080624.xml")

/**
 * A listener that will output each record into a different stream depending
 * on the CopyrightYear of the record.
 *
 * The first record seen for a year creates that year's file and writes the
 * opening root tag; the writer is kept in the supplied map so it can be
 * finished and closed once parsing is over.
 **/
def listenerBase = { Map streams, Record it ->
    if ( !streams.containsKey(it.copyrightYear) ) {
        def f = new File("/output/dir/copyright-${it.copyrightYear}.xml")
        println "creating ${f.absolutePath}"
        def stream = f.newWriter()
        streams[it.copyrightYear] = stream
        stream.append("<CopyrightRenewalRecords>")
    }
    Writer s = (Writer)streams[it.copyrightYear]
    s.append(it.xml())
    s.flush()
}

def reader = XMLReaderFactory.createXMLReader()
def handler = new RecordHandler()
// copyright year -> Writer for that year's output file
def outputStreams = [:]
handler.recordListener = listenerBase.curry(outputStreams)
reader.setContentHandler( handler )

try {
    // withInputStream closes the input once parsing finishes or fails
    file.withInputStream { input ->
        reader.parse( new InputSource( input ) )
    }
} catch (Exception x) {
    x.printStackTrace()
    // the parser may have failed before supplying a locator
    println "Error at line ${handler.locator?.lineNumber}"
}

// The map holds the Writers created by the listener, so the closure
// parameter must be a Writer; otherwise the call fails and the closing
// tags are never written.
outputStreams.each() {
    k, Writer v ->
        println "Closing ${k}"
        v.append("</CopyrightRenewalRecords>")
        v.flush()
        v.close()
}

Write a comment

You need to login to post comments!