"""
Created by Mark on 2008-02-12.
Copyright (c) 2008 The Choate Group, LLC.
All rights reserved.
Redistribution and use in source and binary
forms, with or without modification,
are permitted provided that the following
conditions are met:
Redistributions of source code must retain
the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce
the above copyright notice, this list
of conditions and the following disclaimer in
the documentation and/or other
materials provided with the distribution.
Neither the name of The Choate Group nor the
names of its contributors may be used
to endorse or promote products derived from this
software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS
AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import sys
import os
import urllib
import urllib2
import rdflib
import rdflib.store.SQLite
import glob
from string import Template
# Calais web-service endpoint; getCalaisMetadata() sends its request to this URL.
api_uri = "http://api.opencalais.com/enlighten/calais.asmx/Enlighten"
# string.Template source for the request's paramsXML form field. getParams()
# fills in the $contentType, $outputFormat, $guid and $title placeholders.
parameters = """<c:params xmlns:c="http://s.opencalais.com/1/pred/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:mw="http://choate.info/metawrite/1.0/">
<c:processingDirectives c:contentType="$contentType" c:outputFormat="$outputFormat">
</c:processingDirectives> <c:userDirectives c:allowDistribution="false"
c:allowSearch="false" c:externalID="$guid" c:submitter="Metawrite">
</c:userDirectives><c:externalMetadata>
<mw:title>$title</mw:title>
</c:externalMetadata></c:params>
"""
def getParams(contentType="text/html", outputFormat="xml/rdf", externalID="", title=""):
    """
    Build the paramsXML document for a Calais request.

    The module-level ``parameters`` template is filled in with the given
    values. Every value is XML-escaped first -- including double quotes,
    because three of the four placeholders sit inside attribute values --
    so a title such as ``Q&A`` yields well-formed XML instead of broken
    markup.

    contentType  -- written to the c:contentType attribute
    outputFormat -- written to the c:outputFormat attribute
    externalID   -- written to the c:externalID attribute
    title        -- written as the text of the mw:title element

    Returns the rendered XML as a string.
    """
    from xml.sax.saxutils import escape

    def xml_safe(value):
        # "%s" % (value,) stringifies str and unicode values alike; the
        # extra entity covers double quotes, which escape() leaves alone
        # by default.
        return escape("%s" % (value,), {'"': "&quot;"})

    values = {
        'contentType': xml_safe(contentType),
        'outputFormat': xml_safe(outputFormat),
        'guid': xml_safe(externalID),
        'title': xml_safe(title),
    }
    # Render once and hand the result straight back to the caller.
    return Template(parameters).substitute(values)
def getCalaisMetadata(licenseID="", content="", params=""):
    """
    Submit content to the Calais service and return the raw response body.

    licenseID -- sent as the ``licenseID`` form field
    content   -- sent as the ``content`` form field
    params    -- sent as the ``paramsXML`` form field (see getParams)

    The three values are form-encoded and sent to ``api_uri``. Errors
    raised by urllib2.urlopen propagate to the caller.
    """
    form = urllib.urlencode({
        "licenseID": licenseID,
        "content": content,
        "paramsXML": params,
    })
    # Supplying a request body makes urllib2 issue a POST.
    request = urllib2.Request(api_uri, form)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
def getSqliteRdfRepository(path_to_file="", file_name=""):
    """
    Open a SQLite-backed rdflib graph, creating the store if possible.

    path_to_file -- passed to store.open() as the configuration; for
                    SQLite this is just the path to the directory that
                    holds the database file
    file_name    -- passed to the store as its identifier; for SQLite
                    this just happens to be the file name

    Returns an rdflib.Graph backed by the opened store.

    Author's notes: this works, although it is not how the documentation
    says to do it. The plugins module in rdflib is not used, which as far
    as the author can tell shouldn't have any impact...but you never know.

    The current version of rdflib expects SQLite 2 -- but Python 2.5 uses
    SQLite 3. As a consequence, dbapi2 has to be explicitly imported from
    SQLite 3 in the rdflib distribution.
    """
    store = rdflib.store.SQLite.SQLite(identifier=file_name)
    try:
        store.open(path_to_file, create=True)
    except Exception:
        # Creating the store failed -- presumably because it already
        # exists -- so fall back to opening it as-is. Catching Exception
        # instead of using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        store.open(path_to_file, create=False)
    return rdflib.Graph(store)
if __name__ == '__main__':
    # Notes on what the Calais API accepts (as of Feb, 2008):
    #
    #   * The content can be no longer than 100,000 characters.
    #   * It must be one of the following types: text/html, txt/txt and
    #     text/xml, and the type must match the one encoded in the
    #     paramsXML value.
    #     NOTE(review): "txt/txt" is copied from the original note and may
    #     be a typo for "text/txt" -- confirm against the Calais docs.
    #   * The API is very finicky and will generate errors if there's even
    #     a minor problem.
    #   * With text/xml as the format, the following tags can be used:
    #     title, headline, header, body, description, content, date,
    #     datetime, dateandtime, pubdate
    #   * The content must be in English (for now).
    sample_text = """
This is sample content about Raleigh, North Carolina.
My phone number is (919) 555-1212.
"""
    request_params = getParams(externalID="Dummy", title="Test Document")
    # The response is fetched but not used any further by this demo.
    calais_response = getCalaisMetadata("", sample_text, request_params)

    graph = getSqliteRdfRepository("/Users/mchoate/Documents/Code/django/", "testrdf3.db")

    # glob works relative to the current directory, so move there first.
    os.chdir("/Users/mchoate/Desktop/wikinews/")
    for xml_name in glob.glob("*.xml"):
        graph.parse(xml_name)
    graph.commit()
    print(graph.all_nodes())