votes-xml-to-ttl.rb 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. #!/usr/bin/env ruby
  2. require "rexml/document"
  3. require "unicode"
  4. # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Ruby
  # Computes the Levenshtein (edit) distance between two strings: the minimum
  # number of single-element insertions, deletions and substitutions needed
  # to turn +first+ into +second+.
  #
  # @param first [String] source string
  # @param second [String] target string
  # @return [Integer] the edit distance (the other string's length when one
  #   of the two is empty)
  def levenshtein(first, second)
    rows = first.length
    cols = second.length
    return rows if cols == 0
    return cols if rows == 0

    # table[i][j] is the distance between the first i elements of `first`
    # and the first j elements of `second`; row 0 and column 0 are the
    # "build from nothing" base cases.
    table = (0..rows).map { |i| [i] + [nil] * cols }
    (0..cols).each { |j| table[0][j] = j }

    (1..cols).each do |j|
      (1..rows).each do |i|
        table[i][j] =
          if first[i - 1] == second[j - 1]
            # Matching elements cost nothing extra.
            table[i - 1][j - 1]
          else
            # Cheapest of deletion, insertion and substitution.
            [table[i - 1][j], table[i][j - 1], table[i - 1][j - 1]].min + 1
          end
      end
    end

    table[rows][cols]
  end
  # String helpers used to normalise names before they are compared.
  class String
    # Returns a copy in which every run of spaces, carriage returns, line
    # feeds and tabs is replaced by one space. Leading and trailing
    # whitespace is collapsed the same way, not removed.
    def norm
      gsub(/[ \r\n\t]+/) { " " }
    end

    # Returns the comparison key for this string: whitespace-normalised,
    # lower-cased through the Unicode module, with all whitespace and
    # hyphens removed and "ß" written as "ss".
    def comp
      lowered = Unicode::downcase(norm)
      compact = lowered.gsub(/[\s-]/,'')
      compact.gsub(/ß/,'ss')
    end
  end
  # Load the MEP directory from build/europarl.rdf (relative to this script)
  # and index every foaf:Person by its whitespace-normalised full name.
  #
  # @upper_name2id maps that name to a record with the keys :mep_id, :group,
  # :name, :idx_name (comparison key of the family name), :familyName and
  # :givenName.
  file_name = File.join(File.dirname(__FILE__), "build", "europarl.rdf")
  @upper_name2id = {}

  # Block form so the file handle is closed as soon as the document has been
  # built, instead of staying open for the rest of the run.
  meps = File.open(file_name) { |file| REXML::Document.new(file) }

  # Member resource -> rdf:about of the political group that lists it.
  id2group = {}
  meps.elements.each("/rdf:RDF/foaf:Group[dc:type/@rdf:resource = 'politicalGroups/']/foaf:member") do |element|
    member_id = element.attributes["rdf:resource"].to_s.norm
    id2group[member_id] = element.parent.attributes["rdf:about"].to_s.norm
  end

  meps.elements.each("/rdf:RDF/foaf:Person/foaf:name") do |element|
    id = element.parent.attributes["rdf:about"].to_s.norm
    name = element.text.to_s.norm

    # Walk the name tokens from last to first: the first token that is not
    # all upper case becomes the given name, every other token is
    # capitalised and counted as family name.
    # NOTE(review): only ONE token ever lands in `given`; further given names
    # end up in the family name. Confirm this is intended for multi-part
    # given names.
    given, family = [], []
    name.split(/\s+/).reverse.each do |s|
      if given.empty? && s != Unicode::upcase(s)
        given << s
      else
        family << Unicode::capitalize(s)
      end
    end

    # Tokens were collected back to front; restore reading order once.
    family_name = family.reverse.join(' ')

    @upper_name2id[ name ] = {
      :mep_id => id,
      :group => id2group[ id ],
      :name => name,
      :idx_name => family_name.comp,
      :familyName => family_name,
      :givenName => given.reverse.join(' '),
    }
  end
  59. # puts @upper_name2id
  60. # exit
  # Build a map of lowercase name -> id (possibly folding the name to ASCII).
  # Lookup cache: comparison key of a queried name -> MEP record, or :fail
  # when an earlier lookup for that key found nothing.
  @lut = {}

  # Resolves a (possibly partial) name to an MEP record from @upper_name2id.
  #
  # The comparison key of +name+ (String#comp) is tried, in this order,
  # against:
  # 1. the family name key (:idx_name),
  # 2. "family given",
  # 3. "given family",
  # 4. the end of the full name.
  # The first record matching the earliest rule wins. Results, including
  # failures, are cached in @lut.
  #
  # @param name [String, nil] name as it appears in the votes data
  # @return [Hash, Symbol] the matching record, or :fail when nothing matches
  #   or the name is nil/blank
  def find(name)
    n = name.to_s.comp
    # Every string ends with "", so an empty key would "match" the first MEP
    # through the suffix rule; treat a blank (or nil) name as a failed lookup.
    return :fail if n.empty?

    cached = @lut[n]
    return cached unless cached.nil?

    records = @upper_name2id.values
    match =
      records.detect { |h| h[:idx_name] == n } ||
      records.detect { |h| n == [ h[:familyName], h[:givenName] ].join(' ').comp } ||
      records.detect { |h| n == [ h[:givenName], h[:familyName] ].join(' ').comp } ||
      records.detect { |h| h[:name].comp.end_with?(n) }

    # A Levenshtein-based fuzzy fallback (accepting the closest :idx_name at
    # distance <= 1) was tried here at some point and is disabled:
    # lev = 10000
    # hh = nil
    # @upper_name2id.each do |un,h|
    #   l = levenshtein(n, h[:idx_name])
    #   if l < lev
    #     lev = l
    #     hh = h
    #   end
    # end
    # return t = @lut[n] = hh if lev <= 1

    @lut[n] = match || :fail
  end
  # Write the Turtle @prefix declarations that head the output.
  $stdout.puts <<PREFIXES
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix xdt: <http://www.w3.org/2005/xpath-datatypes#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix dctype: <http://purl.org/dc/dcmitype/> .
@prefix vote: <http://www.europarl.europa.eu/vote/> .
PREFIXES
  # Crawl through the votes XML read from stdin and print Turtle triples for
  # every /session/item: creator (when the author can be resolved to an MEP),
  # date, reference, title, identifier and one triple per resolved voter.
  # Names that cannot be resolved are reported on stderr.

  # Value of the "type" attribute on a <votes> element -> predicate to emit.
  vote_predicates = {
    '-' => 'vote:gotAgainstFrom',
    '+' => 'vote:gotForFrom',
    '0' => 'vote:gotAbstentionFrom',
  }

  votes = REXML::Document.new( $stdin )
  votes.elements.each("/session/item") do |item|
    timestamp = item.attribute('timestamp').to_s
    # Keep only the date part, compacted to YYYYMMDD, for the PDF URL.
    day = timestamp.gsub(/^(\d{4})-?(\d{2})-?(\d{2}).*$/, "\\1\\2\\3")
    num = item.attribute('num').to_s
    pdf_url = "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+PV+#{day}+RES-RCV+DOC+PDF+V0//DE&language=DE\##{num}"
    title = item.attribute('remark').to_s

    # The author is whatever precedes the first '-' of the remark. split
    # returns [] for an empty remark, so coerce the resulting nil to "".
    auth_str = title.split('-')[0].to_s
    auth_mep = find auth_str
    if auth_mep == :fail
      $stderr.puts "! Author not found: '#{auth_str}'"
    else
      puts "<#{pdf_url}> dct:creator <http://www.europarl.europa.eu/#{auth_mep[:mep_id]}> ."
    end

    az = item.attribute('id')
    $stderr.puts "#{az}"
    az_url = "http://www.europarl.europa.eu/sides/getDoc.do?reference=#{az}&language=DE"

    # Backslashes and double quotes must be escaped inside a Turtle string
    # literal, otherwise a title containing them yields invalid output.
    escaped_title = title.gsub(/[\\"]/) { |c| "\\#{c}" }

    puts "<#{pdf_url}> dct:date \"#{timestamp}\"^^xsd:dateTime ."
    puts "<#{pdf_url}> dct:references <#{az_url}> ."
    puts "<#{pdf_url}> dct:title \"\"\"#{escaped_title}\"\"\" ."
    puts "<#{pdf_url}> dct:identifier #{num} ."

    item.elements.each("votes/fraction") do |element|
      # The vote type sits on the enclosing <votes> element and is the same
      # for every name in this fraction.
      vote_type = element.parent.attribute('type').to_s
      vote = vote_predicates[vote_type]
      if vote.nil?
        # Without a predicate the triple would be malformed; skip the fraction.
        $stderr.puts "! Unknown vote type '#{vote_type}' in '#{az}'"
        next
      end

      element.text.to_s.split(",").map { |s| s.strip }.each do |nom|
        next if nom == ''
        r = find nom.norm
        if r == :fail
          $stderr.puts "! Voter lookup failed for: '#{nom}'"
        else
          # Alternative (disabled) triple form:
          # puts "<http://www.europarl.europa.eu/#{r[:mep_id]}> <http://www.europarl.europa.eu/vote/#{vote}> <http://www.europarl.europa.eu/sides/getDoc.do?reference=#{az}&language=DE> ."
          puts "<#{pdf_url}> #{vote} <http://www.europarl.europa.eu/#{r[:mep_id]}> ."
        end
      end
    end
  end