Marcus Rohrmoser 2 years ago
parent
commit
761a9c79d0
5 changed files with 155 additions and 15 deletions
  1. 6 0
      README.md
  2. 8 7
      europarl2foaf.xslt
  3. 46 8
      fetch-europarl.sh
  4. 30 0
      fiddle-europarl-names.rb
  5. 65 0
      mep2foaf.xslt

+ 6 - 0
README.md

@@ -0,0 +1,6 @@
+
+    $ gem install unicode # needed to capitalize the MEP names (used by fiddle-europarl-names.rb, which fetch-europarl.sh runs)
+
+    $ ./fetch-europarl.sh
+
+    $ ./fetch-votes.sh

+ 8 - 7
europarl2foaf.xslt

@@ -6,7 +6,7 @@
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
   xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
   xmlns:dctype="http://purl.org/dc/dcmitype/"
-  xmlns:dc="http://purl.org/dc/terms/"
+  xmlns:dct="http://purl.org/dc/terms/"
   xmlns:foaf="http://xmlns.com/foaf/0.1/"
   xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
   xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
@@ -103,13 +103,14 @@
           </xsl:choose>
         </xsl:variable>
 
-        <foaf:Group rdf:about="http://www.lexvo.org/page/iso3166/{$iso3166}">
+        <foaf:Group rdf:about="http://www.lexvo.org/id/iso3166/{$iso3166}">
+          <dct:type rdf:resource="http://www.lexvo.org/id/iso3166"/>
           <foaf:name xml:lang="de"><xsl:value-of select="country"/></foaf:name>
-          <!-- foaf:member rdf:resource="{$mep_id}"/ -->
+          <foaf:member rdf:resource="{$mep_id}"/>
           <foaf:member>
             <foaf:Group rdf:about="http://www.europarl.europa.eu/nationalPoliticalGroups/{nationalPoliticalGroup}">
               <foaf:name><xsl:value-of select="nationalPoliticalGroup"/></foaf:name>
-              <dc:type rdf:resource="http://www.europarl.europa.eu/nationalPoliticalGroups/"/>
+              <dct:type rdf:resource="http://www.europarl.europa.eu/nationalPoliticalGroups/"/>
               <foaf:member rdf:resource="{$mep_id}"/>
             </foaf:Group>
           </foaf:member>
@@ -120,15 +121,15 @@
           <foaf:member>
             <foaf:Group rdf:about="http://www.europarl.europa.eu/politicalGroups/{$fraktion_id}">
               <!-- foaf:name xml:lang="de"><xsl:value-of select="politicalGroup"/></foaf:name -->
-              <dc:type rdf:resource="http://www.europarl.europa.eu/politicalGroups/"/>
-              <dc:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#integer"><xsl:value-of select="$fraktion_id"/></dc:identifier>
+              <dct:type rdf:resource="http://www.europarl.europa.eu/politicalGroups/"/>
+              <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#integer"><xsl:value-of select="$fraktion_id"/></dct:identifier>
               <foaf:member rdf:resource="{$mep_id}"/>
             </foaf:Group>
           </foaf:member>
         </foaf:Group>
 
         <foaf:Person rdf:about="{$mep_id}">
-          <dc:type rdf:resource="http://www.europarl.europa.eu/meps/"/>
+          <dct:type rdf:resource="http://www.europarl.europa.eu/meps/"/>
           <foaf:workInfoHomepage rdf:resource="http://www.europarl.europa.eu/meps/de/{id}/mep_home.html"/>
           <foaf:name><xsl:value-of select="fullName"/></foaf:name>
           <foaf:depiction rdf:resource="http://www.europarl.europa.eu/mepphoto/{id}.jpg"/>

+ 46 - 8
fetch-europarl.sh

@@ -1,14 +1,52 @@
 #!/bin/sh
-
 cd "$(dirname "${0}")"
 
-rapper -v >/dev/null            || { echo "Install raptor-utils" 1>&2 && exit 1; }
-xsltproc --version > /dev/null  || { echo "Install xsltproc" 1>&2 && exit 1; }
-xmllint --version 2>/dev/null   || { echo "Install libxml2" 1>&2 && exit 1; }
+# uconv --version >/dev/null      || { case "$(uname -s)" in
+# Darwin) # http://apple.stackexchange.com/a/213163
+#   echo "\$ brew icu4c && cd ~/bin && ln \`find /usr/local/Cellar -name uconv | tail -n 1\` && cd -" ;;
+# *)
+#   echo "\$ apt-get install libicu-dev" ;;
+# esac && exit 1 ; }
+# Fail early, with an install hint on stderr, when a tool used further down
+# is missing - before any network traffic or partial output happens.
+curl --version >/dev/null       || { echo "\$ apt-get install curl" 1>&2 && exit 1; }
+rapper -v >/dev/null            || { echo "\$ apt-get install raptor-utils" 1>&2 && exit 1; }
+xsltproc --version > /dev/null  || { echo "\$ apt-get install xsltproc" 1>&2 && exit 1; }
+xmllint --version 2>/dev/null   || { echo "\$ apt-get install libxml2" 1>&2 && exit 1; }
+ruby -v >/dev/null              || { echo "Install ruby" 1>&2 && exit 1; }
+# fiddle-europarl-names.rb does `require "unicode"`; check the gem is loadable.
+ruby -e 'require "unicode"' 2>/dev/null || { echo "\$ gem install unicode" 1>&2 && exit 1; }
 
 dst="build/europarl"
 
-curl --output "${dst}.xml" --time-cond "${dst}.xml" --remote-time "http://www.europarl.europa.eu/meps/de/xml.html?country=&mepId=&politicalGroup=&bodyType=ALL&bodyValue=&countryCircons="
-xmllint --output "${dst}.1.xml" --relaxng meps.rng --format --encode utf-8 "${dst}.xml"
-xsltproc europarl2foaf.xslt "${dst}.xml" | rapper -i rdfxml -o rdfxml-abbrev - http://www.europarl.europa.eu/ > "${dst}.rdf"
-rapper -i rdfxml -o turtle "${dst}.rdf" http://www.europarl.europa.eu/ > "${dst}.ttl"
+# Step 1: download the MEP list once; skipped while the validated copy exists.
+if [ ! -r "${dst}.2.xml" ] ; then
+  echo "fetching EuroParl xml"
+  # curl does not create missing directories for --output.
+  mkdir -p "$(dirname "${dst}")"
+  # --time-cond has to name the file curl actually writes, otherwise the
+  # If-Modified-Since check is based on a stale or non-existent file.
+  curl --silent --output "${dst}.0.xml" --time-cond "${dst}.0.xml" --remote-time "http://www.europarl.europa.eu/meps/de/xml.html?country=&mepId=&politicalGroup=&bodyType=ALL&bodyValue=&countryCircons="
+  # uconv -x any-nfc < "${dst}.0.xml" > "${dst}.1.xml" # http://unix.stackexchange.com/a/90164
+  # Validate against meps.rng and write a pretty-printed UTF-8 copy.
+  xmllint --output "${dst}.2.xml" --relaxng meps.rng --format --encode utf-8 "${dst}.0.xml"
+fi
+# Step 2: MEP list -> RDF/XML (abbreviated) -> Turtle.
+xsltproc europarl2foaf.xslt "${dst}.0.xml" | rapper --quiet --input rdfxml --output rdfxml-abbrev - http://www.europarl.europa.eu/ > "${dst}.rdf"
+rapper --quiet --input rdfxml --output turtle "${dst}.rdf" http://www.europarl.europa.eu/ > "${dst}.ttl"
+ls -l "${dst}.rdf" "${dst}.ttl"
+
+# Step 3: derive given/family names from ${dst}.rdf via the ruby helper.
+echo "processing MEP names"
+ruby fiddle-europarl-names.rb | rapper --quiet -g --output turtle - "http://www.europarl.europa.eu/" > "${dst}.names.ttl"
+rapper --quiet --input turtle --output rdfxml-abbrev "${dst}.names.ttl" "http://www.europarl.europa.eu/" > "${dst}.names.rdf"
+
+# Step 4: scrape every MEP home page that has not been converted yet.
+echo "fetching all MEP html pages"
+# Take the first quoted attribute value of each <foaf:Person ...> line and
+# keep its second '/'-separated field, which is used as the numeric MEP id.
+grep -F '<foaf:Person ' "${dst}.rdf" | cut -d '"' -f 2 | cut -d '/' -f 2 | while read -r mep_id_num
+do
+  url="http://www.europarl.europa.eu/meps/de/${mep_id_num}/mep_home.html"
+  mep_id_url="http://www.europarl.europa.eu/meps/${mep_id_num}"
+
+  trgt="${dst}.mep.${mep_id_num}.rdf"
+  # Existing per-MEP files are kept, so an interrupted run can be resumed.
+  [ -r "${trgt}" ] || {
+    xsltproc --output "${trgt}" --html --stringparam mep_id_url "${mep_id_url}" "mep2foaf.xslt" "${url}" 2> /dev/null
+    printf "."
+  }
+done
+
+# Step 5: convert each per-MEP RDF/XML file to Turtle.
+for rdf in "${dst}.mep."*.rdf
+do
+  # An unmatched glob stays literal in sh; skip it instead of feeding it to rapper.
+  [ -r "${rdf}" ] || continue
+  rapper --quiet --input rdfxml --output turtle "${rdf}" > "${rdf}.ttl"
+done
+
+# Step 6: concatenate the per-MEP Turtle and re-serialize it as one document.
+cat "${dst}.mep."*.rdf.ttl > "${dst}.detail.ttl~"
+rapper --quiet --input turtle --output turtle "${dst}.detail.ttl~" http://www.europarl.europa.eu/ > "${dst}.detail.ttl"
+rapper --quiet --input turtle --output rdfxml-abbrev "${dst}.detail.ttl~" http://www.europarl.europa.eu/ > "${dst}.detail.rdf"
+ls -l "${dst}".??? "${dst}".names.??? "${dst}".detail.???

+ 30 - 0
fiddle-europarl-names.rb

@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+require "rexml/document"
+require "unicode"
+
+file_name = File.join(File.dirname(__FILE__), "build", "europarl.rdf")
+xml = REXML::Document.new( File.new( file_name ) )
+
+puts <<ENDS
+@base <#{xml.root.attributes["xml:base"]}> .
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+
+ENDS
+
+xml.elements.each("/rdf:RDF/foaf:Person/foaf:name") do |element|
+  about_url = element.parent.attributes["rdf:about"].to_s
+
+  given, family = [], []
+  Unicode::compose(element.text().to_s).strip.split(/\s+/).reverse.each do |s|
+    if given.length == 0 && s == Unicode::upcase(s)
+      family << Unicode::capitalize(s)
+    else
+      given << s
+    end
+  end
+
+  puts "<#{about_url}>"
+  puts "  a foaf:Person ;"
+  puts "  foaf:givenName \"\"\"#{given.reverse.join(' ')}\"\"\" ;"
+  puts "  foaf:familyName \"\"\"#{family.reverse.join(' ')}\"\"\" ."
+end

+ 65 - 0
mep2foaf.xslt

@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Extracts contact details of one person from a page made of ul / a / span
+ elements and emits them as RDF/XML (FOAF). fetch-europarl.sh runs it with
+ "xsltproc (two dashes)html" against a MEP home page.
+
+ Parameters:
+   mep_id_url  URI used as rdf:about of every foaf:Person emitted here.
+   language    declared but not referenced anywhere in this stylesheet.
+
+ NOTE(review): foaf:email and foaf:online do not seem to be terms of the
+ FOAF vocabulary (foaf:mbox and foaf:account look like the standard ones).
+ Confirm against the FOAF specification and the consumers of this data
+ before renaming.
+
+ http://www.w3.org/TR/xslt/
+-->
+<xsl:stylesheet
+  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+  xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
+  xmlns:dctype="http://purl.org/dc/dcmitype/"
+  xmlns:dct="http://purl.org/dc/terms/"
+  xmlns:foaf="http://xmlns.com/foaf/0.1/"
+  xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:xlink="http://www.w3.org/1999/xlink"
+  version="1.0">
+
+  <xsl:output method="xml" indent="yes"/>
+
+  <!-- Both parameters default to the empty string when not passed in. -->
+  <xsl:param name="mep_id_url"></xsl:param>
+  <xsl:param name="language"></xsl:param>
+
+  <xsl:template match="/">
+    <rdf:RDF>
+      <!-- One pass per link list: e-mail, website, facebook and twitter links. -->
+      <xsl:for-each select=".//ul[@class = 'link_collection_noborder']">
+        <foaf:Person rdf:about="{$mep_id_url}">
+          <!-- Only the anchor with id 'email-0' is used; the href is rebuilt
+               from whatever follows 'mailto:' with whitespace normalized. -->
+          <xsl:for-each select=".//a[@id = 'email-0']">
+            <xsl:variable name="url" select="concat('mailto:', normalize-space(substring-after(@href, 'mailto:')))"/>
+            <foaf:email rdf:resource="{$url}"/>
+          </xsl:for-each>
+          <xsl:for-each select=".//a[@class = 'link_website']">
+            <foaf:homepage rdf:resource="{normalize-space(@href)}"/>
+          </xsl:for-each>
+        </foaf:Person>
+        <!-- Facebook: drop anything in front of the first 'http' in the href,
+             then link the person to an OnlineAccount identified by that URL. -->
+        <xsl:for-each select=".//a[@class = 'link_fb']">
+          <xsl:variable name="url" select="concat('http', normalize-space(substring-after(@href, 'http')))"/>
+          <foaf:Person rdf:about="{$mep_id_url}">
+            <foaf:online rdf:resource="{$url}"/>
+          </foaf:Person>
+          <foaf:OnlineAccount rdf:about="{$url}">
+            <foaf:accountServiceHomepage rdf:resource="https://facebook.com/"/>
+          </foaf:OnlineAccount>
+        </xsl:for-each>
+        <!-- Twitter: same handling as the facebook links above. -->
+        <xsl:for-each select=".//a[@class = 'link_twitt']">
+          <xsl:variable name="url" select="concat('http', normalize-space(substring-after(@href, 'http')))"/>
+          <foaf:Person rdf:about="{$mep_id_url}">
+            <foaf:online rdf:resource="{$url}"/>
+          </foaf:Person>
+          <foaf:OnlineAccount rdf:about="{$url}">
+            <foaf:accountServiceHomepage rdf:resource="https://twitter.com/"/>
+          </foaf:OnlineAccount>
+        </xsl:for-each>
+      </xsl:for-each>
+
+      <!-- Phone numbers and birth data are searched in the whole document. -->
+      <foaf:Person rdf:about="{$mep_id_url}">
+        <!-- Spaces inside the number are replaced by '-' to build the tel: URI. -->
+        <xsl:for-each select=".//span[@class = 'phone']">
+          <foaf:phone rdf:resource="tel:{translate(normalize-space(.), ' ', '-')}"/>
+        </xsl:for-each>
+        <!-- Matches text starting with the German label 'geboren': the part
+             between 'geboren am' and the first comma becomes foaf:birthday,
+             everything after the first comma becomes dct:spatial. -->
+        <xsl:for-each select=".//span[@class = 'more_info' and starts-with(normalize-space(.), 'geboren') ]">
+          <foaf:birthday><xsl:value-of select="normalize-space(substring-before(substring-after(., 'geboren am'), ','))"/></foaf:birthday>
+          <dct:spatial><xsl:value-of select="normalize-space(substring-after(., ','))"/></dct:spatial>
+        </xsl:for-each>
+      </foaf:Person>
+    </rdf:RDF>
+  </xsl:template>
+
+</xsl:stylesheet>