Browse Source

correlate votes. a bit. at least.

Marcus Rohrmoser 2 years ago
parent
commit
6676af1c0a
7 changed files with 372 additions and 5 deletions
  1. 1 4
      .gitignore
  2. 37 0
      daytoc2rdf.xslt
  3. 1 1
      fetch.sh
  4. 34 0
      fetch-votes.sh
  5. 112 0
      pdftotext-postprocess.rb
  6. 24 0
      votes-pdf2xml.sh
  7. 163 0
      votes-xml-to-ttl.rb

+ 1 - 4
.gitignore

@@ -1,4 +1 @@
-europarl.1.xml
-europarl.rdf
-europarl.ttl
-europarl.xml
+build/

+ 37 - 0
daytoc2rdf.xslt

@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ http://www.w3.org/TR/xslt/
+
+ Turns the HTML table of contents of one sitting day into RDF/XML: every link
+ without a class attribute whose href contains 'getDoc.do?pubRef=-//EP//TEXT'
+ becomes an rdf:Description carrying its title, language and date.
+
+ Parameters:
+   day       the sitting day as YYYYMMDD
+   language  language code of the page; appended to the lexvo iso639-1 URI
+-->
+<xsl:stylesheet
+  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+  xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
+  xmlns:dctype="http://purl.org/dc/dcmitype/"
+  xmlns:dc="http://purl.org/dc/terms/"
+  xmlns:foaf="http://xmlns.com/foaf/0.1/"
+  xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:xlink="http://www.w3.org/1999/xlink"
+  xmlns:iso639-1="http://lexvo.org/id/iso639-1/"
+  version="1.0">
+
+  <xsl:output method="xml" indent="yes"/>
+
+  <xsl:param name="day"></xsl:param>
+  <xsl:param name="language"></xsl:param>
+
+  <xsl:template match="/">
+    <rdf:RDF xml:base="http://www.europarl.europa.eu/" xml:lang="{$language}">
+      <xsl:for-each select=".//a[not(@class) and contains(@href,'getDoc.do?pubRef=-//EP//TEXT')]">
+        <!-- xsl:comment> zzz </xsl:comment -->
+        <rdf:Description rdf:about="{@href}">
+          <dc:title><xsl:value-of select="normalize-space(.)"/></dc:title>
+          <dc:language rdf:resource="http://lexvo.org/id/iso639-1/{$language}"/>
+          <!-- $day only carries a calendar day, so the literal is typed xsd:date;
+               a value without a time part is not a valid xsd:dateTime. -->
+          <dc:date rdf:datatype="http://www.w3.org/2001/XMLSchema#date">
+            <xsl:value-of select="substring($day, 1, 4)"/>-<xsl:value-of select="substring($day, 5, 2)"/>-<xsl:value-of select="substring($day, 7, 2)"/>
+          </dc:date>
+        </rdf:Description>
+      </xsl:for-each>
+    </rdf:RDF>
+  </xsl:template>
+
+</xsl:stylesheet>

+ 1 - 1
fetch.sh

@@ -6,7 +6,7 @@ rapper -v >/dev/null            || { echo "Install raptor-utils" 1>&2 && exit 1;
 xsltproc --version > /dev/null  || { echo "Install xsltproc" 1>&2 && exit 1; }
 xmllint --version 2>/dev/null   || { echo "Install libxml2" 1>&2 && exit 1; }
 
-dst="europarl"
+dst="build/europarl"
 
 curl --output "${dst}.xml" --time-cond "${dst}.xml" --remote-time "http://www.europarl.europa.eu/meps/de/xml.html?country=&mepId=&politicalGroup=&bodyType=ALL&bodyValue=&countryCircons="
 xmllint --output "${dst}.1.xml" --relaxng meps.rng --format --encode utf-8 "${dst}.xml"

+ 34 - 0
fetch-votes.sh

@@ -0,0 +1,34 @@
+#!/bin/sh
+
+cd "$(dirname "${0}")"
+
+# Sitzungstag: http://www.europarl.europa.eu/sides/getDoc.do?type=TA&reference=20170215&secondRef=TOC&language=DE
+#
+
+build="build"
+
+for day in \
+  20170124 20170125 20170126 20170127 20170128 \
+  20170131 20170201 20170202 20170203 20170204 \
+  20170207 20170208 20170209 20170210 20170211 \
+  20170214 20170215 20170216 20170217 20170218
+do
+  for language in de en fr
+  do
+    dst="${build}/${day}@${language}"
+    echo "${dst}"
+    xsltproc --output "${dst}.rdf" --html \
+       --stringparam "day" "${day}" --stringparam "language" "${language}" \
+      daytoc2rdf.xslt "http://www.europarl.europa.eu/sides/getDoc.do?type=TA&reference=${day}&secondRef=TOC&language=${language}" \
+    && rapper --quiet --input rdfxml --output turtle "${dst}.rdf" "http://www.europarl.europa.eu/" > "${dst}.ttl"
+  done
+
+  if [ -r "${build}/${day}@de.ttl" ] ; then
+    dst="${build}/${day}"
+    # Votes, see e.g. http://www.europarl.europa.eu/plenary/de/minutes.html
+    url="http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+PV+${day}+RES-RCV+DOC+PDF+V0//DE&language=DE"
+    echo "${url}"
+    curl --silent --output "${dst}.pdf" --time-cond "${dst}.pdf" --remote-time "${url}"
+  fi
+done
+

+ 112 - 0
pdftotext-postprocess.rb

@@ -0,0 +1,112 @@
+#!/usr/bin/env ruby
+require 'unicode'
+#
+# Reads pdftotext output (files given as arguments, or stdin) and rewrites it
+# with a chain of regular-expression substitutions into a <session> XML
+# document that is printed to stdout.
+#
+txt = Unicode::compose(ARGF.read)
+
+###########################################################
+### Pagebreaks
+# pdftotext separates pages with a form feed character; drop them all.
+txt.gsub!("\f", "")
+
+###########################################################
+### Page footer
+# Drops every footer: a file name shaped like "X0_XX(0000)00-00(XXX)_XX.docx",
+# newline(s), a line of digits (presumably the page number), newline(s) and a
+# "XX 000.000" style reference.
+txt.gsub!(/[A-Z]\d_[A-Z]{2}\(\d{4}\)\d{2}-\d{2}\([A-Z]{3}\)_[A-Z]{2}\.docx
++\d+
++[A-Z]{2} \d{3}\.\d{3}/, "")
+
+###########################################################
+### Corrections marker
+# Wraps each span from the Bulgarian heading up to the next "RÖSTER"
+# (non-greedy, across lines) in an XML comment.
+txt.gsub!(/ПОПРАВКИ В ПОДАДЕНИТЕ ГЛАСОВЕ И НАМЕРЕНИЯ ЗА ГЛАСУВАНЕ.+?RÖSTER/m, "
+<!-- \\0 -->
+")
+
+### fix sequence
+# Where a digits-only line precedes a "dd/mm/yyyy hh:mm:ss.mmm" timestamp,
+# swap the two so that the timestamp comes first.
+txt.gsub!(/
+(\d+)
++(\d{2}\/\d{2}\/\d{4} \d{2}:\d{2}:\d{2}.\d{3})/, "
+\\2
+
+\\1")
+
+### fix sequence
+# Where a "X0-0000/0000 - text" reference line precedes an "N." line,
+# swap the two so that the number comes first.
+txt.gsub!(/
+([A-Z]\d\-\d{4}\/\d{4}\s*-\s*[^\n]+)
++(\d+\.)/, "
+\\2
+
+\\1")
+
+###########################################################
+### real votes block
+# A number, an empty line and one of 0 / + / - closes the open <fraction> and
+# <votes> and opens a new <votes> carrying that type and count.
+txt.gsub!(/(\d+)
+
+([0+-])/, "
+</fraction>
+</votes>
+<votes type='\\2' count='\\1'>
+<fraction>")
+
+###########################################################
+### one fraction's votes
+# A line "NAME:" (letters, '/' and '&' only) starts a new named <fraction>.
+txt.gsub!(/\n([A-Za-z\/\&]+):[ \n]*/, "
+</fraction>
+<fraction name='\\1'>")
+
+###########################################################
+### one agenda item
+# "N.", a reference with its remark, and a dd/mm/yyyy timestamp become an
+# <item> element; the timestamp is rewritten to yyyy-mm-ddThh:mm:ss.
+txt.gsub!(/(\d+)\.
++([A-Z]\d\-\d{4}\/\d{4})\s*-\s*([^\n]+)
++(\d{2})\/(\d{2})\/(\d{4}) (\d{2}:\d{2}:\d{2}).\d{3}/, "
+</fraction>
+</votes>
+</item>
+<item id='\\2' remark='\\3' num='\\1' type='b' timestamp='\\6-\\5-\\4T\\7'>
+<fraction>")
+
+###########################################################
+### vote corrections
+# A line consisting solely of 0 / + / - (no count in front, that case was
+# consumed above) opens a <votes> block with a 'correction' fraction.
+txt.gsub!(/^([0+-])$/, "
+</fraction>
+</votes>
+<votes type='\\1'>
+<fraction name='correction'>
+")
+
+###########################################################
+### collapse multiple linefeeds
+txt.gsub!(/\n\s*\n\s*/, "\n")
+
+###########################################################
+### append closing xml tags
+# Closes the elements left open by the last item.
+txt <<= "</fraction>
+</votes>
+</item>"
+
+###########################################################
+### clean/collapse some closing xml tags
+# Removes an empty <fraction> together with the </votes> that follows it.
+txt.gsub!(/<fraction>
+<\/fraction>
+<\/votes>
+/, "")
+
+###########################################################
+### clean/collapse some closing xml tags
+# Removes remaining empty <fraction> elements.
+txt.gsub!(/<fraction>
+<\/fraction>
+/, "")
+
+###########################################################
+### clean/collapse some closing xml tags
+# Inserts the </votes> that is missing between </fraction> and </item>.
+txt.gsub!(/<\/fraction>
+<\/item>/, "</fraction>
+</votes><!-- uhu -->
+</item>")
+
+# Escape the ampersand of the "S&D" group name; no other '&' is escaped here.
+txt.gsub!("S&D", "S&amp;D")
+# Close, right before the first <item>, the XML comment that is opened by the
+# "<session><!-- " line printed below.
+txt.sub!("<item", "--><item")
+
+# Everything preceding the first <item> thus ends up inside an XML comment.
+puts "<session><!-- "
+puts txt
+puts "</session>"

+ 24 - 0
votes-pdf2xml.sh

@@ -0,0 +1,24 @@
+#!/bin/sh
+cd "$(dirname "${0}")"
+
+rapper -v >/dev/null            || { echo "Install raptor-utils" 1>&2 && exit 1; }
+ruby -v >/dev/null              || { echo "Install ruby" 1>&2 && exit 1; }
+pdftotext -v 2>/dev/null        || { echo "Install poppler" 1>&2 && exit 1; }
+xmllint --version 2>/dev/null   || { echo "Install libxml2" 1>&2 && exit 1; }
+
+build="build"
+
+for pdf in "${build}/"*.pdf
+do
+  day="${build}/$(basename "${pdf}" .pdf)"
+  echo "${day}"
+  pdftotext "${pdf}"
+
+  ruby pdftotext-postprocess.rb < "${day}.txt" \
+  | xmllint --output "${day}.xml" --format --encode utf-8 -
+
+  ruby votes-xml-to-ttl.rb <  "${day}.xml" > "${day}.ttl~"
+
+  rapper --input turtle --output turtle "${day}.ttl~" "http://www.europarl.europa.eu/" > "${day}.ttl"
+  rapper --input turtle --output rdfxml-abbrev "${day}.ttl~" "http://www.europarl.europa.eu/" > "${day}.rdf"
+done

+ 163 - 0
votes-xml-to-ttl.rb

@@ -0,0 +1,163 @@
+#!/usr/bin/env ruby
+require "rexml/document"
+require "unicode"
+
+# https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Ruby
+def levenshtein(first, second)
+  m, n = first.length, second.length
+  return m if n == 0
+  return n if m == 0
+
+  # Create our distance matrix
+  d = Array.new(m+1) {Array.new(n+1)}
+  0.upto(m) { |i| d[i][0] = i }
+  0.upto(n) { |j| d[0][j] = j }
+
+  1.upto(n) do |j|
+    1.upto(m) do |i|
+      d[i][j] = first[i-1] == second[j-1] ? d[i-1][j-1] : [d[i-1][j]+1,d[i][j-1]+1,d[i-1][j-1]+1,].min
+    end
+  end
+  d[m][n]
+end
+
+class String
+  def norm
+    gsub(/[ \r\n\t]+/, " ")
+  end
+  def comp
+    Unicode::downcase(norm).gsub(/[\s-]/,'').gsub(/ß/,'ss')
+  end
+end
+
+
+file_name = File.join(File.dirname(__FILE__), "build", "europarl.rdf")
+
+@upper_name2id = {}
+
+begin
+  file = File.new( file_name )
+  meps = REXML::Document.new( file )
+  id2group = {}
+
+  meps.elements.each("/rdf:RDF/foaf:Group[dc:type/@rdf:resource = 'politicalGroups/']/foaf:member") do |element|
+    id2group[ element.attributes["rdf:resource"].to_s.norm ] = element.parent.attributes["rdf:about"].to_s.norm
+  end
+
+  meps.elements.each("/rdf:RDF/foaf:Person/foaf:name") do |element|
+    id = element.parent.attributes["rdf:about"].to_s.norm
+    name = element.text().to_s.norm
+    given, family = [], []
+    name.split(/\s+/).reverse.each do |s|
+      if given.length == 0 && s != Unicode::upcase(s)
+        given << s
+      else
+        family << Unicode::capitalize(s)
+      end
+    end
+    h = {
+      :mep_id => id,
+      :group => id2group[ id ],
+      :name => name,
+      :idx_name => family.reverse.join(' ').comp,
+      :familyName => family.reverse.join(' '),
+      :givenName  => given.reverse.join(' '),
+    }
+    @upper_name2id[ name ] = h
+  end
+end
+
+# puts @upper_name2id
+# exit
+
+# build a map of lowercase name -> id, (evtl. fold name to ascii)
+@lut = {}
+def find(name)
+  n = name.comp
+  t = @lut[n]
+  return t unless t.nil?
+
+  @upper_name2id.each do |un,h|
+    return t = @lut[n] = h if h[:idx_name] == n
+  end
+
+  @upper_name2id.each do |un,h|
+    return t = @lut[n] = h if n == [ h[:familyName], h[:givenName] ].flatten.join(' ').comp
+  end
+
+  @upper_name2id.each do |un,h|
+    return t = @lut[n] = h if n == [ h[:givenName], h[:familyName] ].flatten.join(' ').comp
+  end
+
+  @upper_name2id.each do |un,h|
+    return t = @lut[n] = h if h[:name].comp.end_with? n
+  end
+
+#   lev = 10000
+#   hh = nil
+#   @upper_name2id.each do |un,h|
+#     l = levenshtein(n, h[:idx_name])
+#     if l < lev
+#       lev = l
+#       hh = h
+#     end
+#   end
+#   return t = @lut[n] = hh if lev <= 1
+
+  # puts "? '#{name}' => '#{hh[:name]}' (#{lev})"
+  # $stderr.puts "? '#{name}'"
+  @lut[n] = :fail
+end
+
+puts <<ENDS
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix xdt: <http://www.w3.org/2005/xpath-datatypes#> .
+@prefix dct: <http://purl.org/dc/terms/> .
+@prefix dctype: <http://purl.org/dc/dcmitype/> .
+@prefix vote: <http://www.europarl.europa.eu/vote/> .
+
+ENDS
+
+# crawl through the votes xml and replace name with meps/id
+date = nil
+votes = REXML::Document.new( $stdin )
+votes.elements.each("/session/item") do |item|
+  timestamp = item.attribute('timestamp').to_s
+  day = timestamp.gsub(/^(\d{4})-?(\d{2})-?(\d{2}).*$/, "\\1\\2\\3") if date.nil?
+
+  num = item.attribute('num').to_s
+  pdf_url = "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+PV+#{day}+RES-RCV+DOC+PDF+V0//DE&language=DE\##{num}"
+  title = item.attribute('remark').to_s
+  auth_str = title.split('-')[0]
+  auth_mep = find auth_str
+  unless auth_mep == :fail
+    puts "<#{pdf_url}> dct:creator <http://www.europarl.europa.eu/#{auth_mep[:mep_id]}> ."
+  else
+    $stderr.puts "! Author not found: '#{auth_str}'"
+  end
+
+  az = item.attribute('id')
+  $stderr.puts "#{az}"
+  az_url = "http://www.europarl.europa.eu/sides/getDoc.do?reference=#{az}&language=DE"
+  puts "<#{pdf_url}> dct:date \"#{timestamp}\"^^xsd:dateTime ."
+  puts "<#{pdf_url}> dct:references <#{az_url}> ."
+  puts "<#{pdf_url}> dct:title \"\"\"#{title}\"\"\" ."
+  puts "<#{pdf_url}> dct:identifier #{num} ."
+  item.elements.each("votes/fraction") do |element|
+    element.text().to_s.split(",").collect{|s| s.strip}.each do |nom|
+      next if nom == ''
+      r = find nom.norm
+      if r == :fail
+        $stderr.puts "! Voter lookup failed for: '#{nom}'"
+      else
+        vote = case element.parent().attribute('type').to_s
+          when '-' then 'vote:gotAgainstFrom'
+          when '+' then 'vote:gotForFrom'
+          when '0' then 'vote:gotAbstentionFrom'
+        end
+        # puts "<http://www.europarl.europa.eu/#{r[:mep_id]}> <http://www.europarl.europa.eu/vote/#{vote}> <http://www.europarl.europa.eu/sides/getDoc.do?reference=#{az}&language=DE> ."
+        puts "<#{pdf_url}> #{vote} <http://www.europarl.europa.eu/#{r[:mep_id]}> ."
+      end
+    end
+  end
+end