#!/bin/sh
# fetch-europarl.sh
#
# Fetches the EuroParl MEP XML export, converts it to RDF/XML and Turtle with
# the XSLT stylesheets that live next to this script, builds a separate
# "names" graph via fiddle-europarl-names.rb, then downloads every MEP's home
# page and converts each one to RDF before merging them into a "detail" graph.
#
# Outputs (relative to the script directory):
#   build/europarl.{0,2}.xml            raw download / validated + reformatted
#   build/europarl.{rdf,ttl}            converted MEP list
#   build/europarl.names.{ttl,rdf}      output of fiddle-europarl-names.rb
#   build/europarl.mep.<id>.rdf[.ttl]   one file per MEP home page
#   build/europarl.detail.{ttl,rdf}     all per-MEP graphs merged
#
# Requires: curl, rapper, xsltproc, xmllint, ruby.

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# need TOOL HINT: abort with HINT unless TOOL is found on PATH.
need() {
  command -v "$1" >/dev/null 2>&1 || die "$2"
}

# All relative paths below (stylesheets, schema, build/) are resolved against
# the script's own directory, so a failed cd must not be ignored.
cd "$(dirname "${0}")" || die "cannot change to the script directory"

# Unicode normalisation via uconv is currently disabled; kept for reference.
# uconv --version >/dev/null || { case "$(uname -s)" in
# Darwin) # http://apple.stackexchange.com/a/213163
# echo "\$ brew icu4c && cd ~/bin && ln \`find /usr/local/Cellar -name uconv | tail -n 1\` && cd -" ;;
# *)
# echo "\$ apt-get install libicu-dev" ;;
# esac && exit 1 ; }

need rapper "\$ apt-get install raptor-utils"
need xsltproc "\$ apt-get install xsltproc"
# xmllint is shipped in libxml2-utils on Debian/Ubuntu, not in libxml2 itself.
need xmllint "\$ apt-get install libxml2-utils"
need ruby "Install ruby"
need curl "Install curl"

dst="build/europarl"
base="http://www.europarl.europa.eu/"

# curl --output does not create missing parent directories.
mkdir -p "$(dirname "${dst}")" || die "cannot create $(dirname "${dst}")"

# The download is skipped as long as the validated copy is still around.
if [ ! -r "${dst}.2.xml" ]; then
  echo "fetching EuroParl xml"
  # --fail makes an HTTP error a non-zero exit instead of saving the error
  # page as if it were the XML export.
  # NOTE(review): --time-cond points at "${dst}.xml", which nothing in this
  # script creates; "${dst}.0.xml" may have been intended - confirm.
  curl --silent --show-error --fail \
    --output "${dst}.0.xml" \
    --time-cond "${dst}.xml" \
    --remote-time \
    "http://www.europarl.europa.eu/meps/de/xml.html?country=&mepId=&politicalGroup=&bodyType=ALL&bodyValue=&countryCircons=" \
    || die "download of the EuroParl xml failed"
  # uconv -x any-nfc < "${dst}.0.xml" > "${dst}.1.xml" # http://unix.stackexchange.com/a/90164
  xmllint --output "${dst}.2.xml" --relaxng meps.rng --format --encode utf-8 "${dst}.0.xml"
fi

# MEP list: XML -> RDF/XML (abbreviated) -> Turtle.
xsltproc europarl2foaf.xslt "${dst}.0.xml" \
  | rapper --quiet --input rdfxml --output rdfxml-abbrev - "${base}" > "${dst}.rdf"
rapper --quiet --input rdfxml --output turtle "${dst}.rdf" "${base}" > "${dst}.ttl"
ls -l "${dst}.rdf" "${dst}.ttl"

echo "processing MEP names"
ruby fiddle-europarl-names.rb \
  | rapper --quiet -g --output turtle - "${base}" > "${dst}.names.ttl"
rapper --quiet --input turtle --output rdfxml-abbrev "${dst}.names.ttl" "${base}" > "${dst}.names.rdf"

echo "fetching all MEP html pages"
# Each <foaf:Person ...> line yields one id: the text inside the first pair of
# double quotes, then the second '/'-separated field of that text.
grep -F '<foaf:Person ' "${dst}.rdf" | cut -d '"' -f 2 | cut -d '/' -f 2 | while read -r mep_id_num
do
  # A line without the expected shape would otherwise yield a bogus URL/file.
  [ -n "${mep_id_num}" ] || continue
  url="http://www.europarl.europa.eu/meps/de/${mep_id_num}/mep_home.html"
  mep_id_url="http://www.europarl.europa.eu/meps/${mep_id_num}"
  trgt="${dst}.mep.${mep_id_num}.rdf"
  # Pages converted by an earlier run are not fetched again.
  if [ ! -r "${trgt}" ]; then
    # stderr is discarded here; presumably to hide HTML parser noise - confirm.
    xsltproc --output "${trgt}" --html --stringparam mep_id_url "${mep_id_url}" "mep2foaf.xslt" "${url}" 2> /dev/null
    printf "."
  fi
done

# Convert every per-MEP RDF/XML file to Turtle.
for rdf in "${dst}.mep."*.rdf
do
  # Without any match the shell leaves the pattern itself in ${rdf}.
  [ -e "${rdf}" ] || continue
  rapper --quiet --input rdfxml --output turtle "${rdf}" > "${rdf}.ttl"
done

# Concatenate the per-MEP Turtle files, then let rapper re-serialise the
# result as Turtle and as abbreviated RDF/XML.
cat "${dst}.mep."*.rdf.ttl > "${dst}.detail.ttl~"
rapper --quiet --input turtle --output turtle "${dst}.detail.ttl~" "${base}" > "${dst}.detail.ttl"
rapper --quiet --input turtle --output rdfxml-abbrev "${dst}.detail.ttl~" "${base}" > "${dst}.detail.rdf"
ls -l "${dst}".??? "${dst}".names.??? "${dst}".detail.???