//
// Copyright (C) 2017-2018 Marcus Rohrmoser, http://purl.mro.name/ShaarliGo
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
package main

import (
	"io"
	"net/url"
	"strings"
	"time"

	"github.com/yhat/scrape"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)
  27. var serverLocation *time.Location
  28. func init() {
  29. // TODO rather use app settings?
  30. serverLocation, _ = time.LoadLocation("Europe/Berlin")
  31. }
  32. func entryFromURL(ur *url.URL, timeout time.Duration) (Entry, error) {
  33. if r, err := HttpGetBody(ur, timeout); err != nil {
  34. return Entry{}, err
  35. } else {
  36. return entryFromReader(r, ur)
  37. }
  38. }
  39. func entryFromReader(r io.Reader, ur *url.URL) (Entry, error) {
  40. if root, err := html.Parse(r); err != nil {
  41. return Entry{}, err
  42. } else {
  43. return entryFromNode(root, ur)
  44. }
  45. }
  46. func entryFromNode(root *html.Node, ur *url.URL) (Entry, error) {
  47. ret := Entry{}
  48. for _, node := range scrape.FindAll(root, func(n *html.Node) bool {
  49. return n.Parent == root && n.Type == html.ElementNode && atom.Html == n.DataAtom
  50. }) {
  51. ret.XmlLang = scrape.Attr(node, "lang")
  52. break
  53. }
  54. for _, node := range scrape.FindAll(root, func(n *html.Node) bool { return n.Type == html.ElementNode && atom.Meta == n.DataAtom }) {
  55. strName := scrape.Attr(node, "name")
  56. strProp := scrape.Attr(node, "property")
  57. strContent := scrape.Attr(node, "content")
  58. switch {
  59. case "title" == strName:
  60. ret.Title = HumanText{Body: strContent}
  61. case "description" == strName:
  62. ret.Summary = &HumanText{Body: strContent}
  63. case "author" == strName:
  64. ret.Authors = append(ret.Authors, Person{Name: strContent})
  65. case "date" == strName:
  66. var t time.Time
  67. var err error
  68. if t, err = time.Parse(time.RFC3339, strContent); err != nil {
  69. if t, err = time.ParseInLocation("2006-01-02T15:04:05Z0700", strContent, serverLocation); err != nil {
  70. if t, err = time.ParseInLocation("2006-01-02T15:04:05", strContent, serverLocation); err != nil {
  71. //panic(err)
  72. }
  73. }
  74. }
  75. if err == nil {
  76. ret.Published = iso8601(t)
  77. }
  78. case "keywords" == strName:
  79. for _, txt := range strings.Split(strContent, ",") {
  80. if t := strings.Replace(strings.TrimSpace(txt), " ", "_", -1); "" != t {
  81. ret.Categories = append(ret.Categories, Category{Term: t})
  82. }
  83. }
  84. case "og:title" == strProp:
  85. ret.Title = HumanText{Body: strContent}
  86. case "og:description" == strProp:
  87. ret.Summary = &HumanText{Body: strContent}
  88. case nil == ret.MediaThumbnail && "og:image" == strProp:
  89. ret.MediaThumbnail = &MediaThumbnail{Url: strContent}
  90. }
  91. }
  92. return ret, nil
  93. }