atom.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. //
  2. // Copyright (C) 2017-2019 Marcus Rohrmoser, http://purl.mro.name/ShaarliGo
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU General Public License
  15. // along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. //
  17. package main
  18. import (
  19. "encoding/base64"
  20. "encoding/binary"
  21. "encoding/xml"
  22. "errors"
  23. "fmt"
  24. "io"
  25. "os"
  26. "sort"
  27. "strconv"
  28. "strings"
  29. "time"
  30. // "golang.org/x/tools/blog/atom"
  31. "github.com/yhat/scrape"
  32. "golang.org/x/net/html"
  33. )
// lengthyAtomPreambleComment is the text of the XML comment that
// xmlEncodeWithXslt writes ahead of the document element. As the text itself
// explains, its purpose is to push the feed's root tag out of the first 512
// bytes so Firefox still applies the XSLT stylesheet.
const lengthyAtomPreambleComment string = `
https://developer.mozilla.org/en/docs/XSL_Transformations_in_Mozilla_FAQ#Why_isn.27t_my_stylesheet_applied.3F
Caution! Firefox ignores your XSLT stylesheet if your XML looks like a RSS or Atom feed. A typical workaround is to insert an XML comment at the beginning of your XML file to move the <fEEd or <rsS tag out of the first 512 bytes used by Firefox to guess whether it is a feed or not.
See also the discussion at https://bugzilla.mozilla.org/show_bug.cgi?id=338621#c72.
For best results, serve both atom feed and xslt as 'text/xml' or 'application/xml' without charset specified.
`

// atomNamespace is the XML namespace URI of Atom; the same URI appears
// literally in the XMLName tags of Feed and Entry.
const atomNamespace = "http://www.w3.org/2005/Atom"
  41. func FeedFromFileName(file string) (Feed, error) {
  42. if read, err := os.Open(file); nil == read || nil != err {
  43. return Feed{}, err
  44. } else {
  45. defer read.Close()
  46. return FeedFromReader(read)
  47. }
  48. }
  49. func FeedFromReader(file io.Reader) (Feed, error) {
  50. ret := Feed{}
  51. err := xml.NewDecoder(file).Decode(&ret)
  52. return ret, err
  53. }
  54. type Iri string // https://tools.ietf.org/html/rfc3987
  55. type Id Iri // we allow relative Ids (in persistent store)
  56. type Lang string // https://tools.ietf.org/html/rfc3066
  57. type Relation string // https://www.iana.org/assignments/link-relations/link-relations.xhtml#link-relations-1
  58. type MimeType string // https://tools.ietf.org/html/rfc2045#section-5.1
  59. type TextType string // https://tools.ietf.org/html/rfc4287#section-4.1.3.1
// Feed is the root <feed> element of an Atom document (XML namespace
// http://www.w3.org/2005/Atom). The field order is the order in which the
// child elements are written when the struct is encoded.
//
// https://mro.github.io/atomenabled.org/
// https://tools.ietf.org/html/rfc4287#section-4.1.1
//
// see also https://godoc.org/golang.org/x/tools/blog/atom#Feed
type Feed struct {
	XMLName         xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
	XmlBase         Iri      `xml:"xml:base,attr,omitempty"`
	XmlLang         Lang     `xml:"xml:lang,attr,omitempty"`
	XmlNSShaarliGo  string   `xml:"xmlns:sg,attr,omitempty"`         // https://github.com/golang/go/issues/9519#issuecomment-252196382
	SearchTerms     string   `xml:"sg:searchTerms,attr,omitempty"`   // rather use http://www.opensearch.org/Specifications/OpenSearch/1.1#Example_of_OpenSearch_response_elements_in_Atom_1.0
	XmlNSOpenSearch string   `xml:"xmlns:opensearch,attr,omitempty"` // https://github.com/golang/go/issues/9519#issuecomment-252196382
	Query           string   `xml:"opensearch:Query,omitempty"`      // http://www.opensearch.org/Specifications/OpenSearch/1.1#Example_of_OpenSearch_response_elements_in_Atom_1.0
	Title           HumanText  `xml:"title"`
	Subtitle        *HumanText `xml:"subtitle,omitempty"`
	Id              Id         `xml:"id"`
	Updated         iso8601    `xml:"updated"`
	Generator       *Generator `xml:"generator,omitempty"`
	Icon            Iri        `xml:"icon,omitempty"`
	Logo            Iri        `xml:"logo,omitempty"`
	Links           []Link     `xml:"link"`
	Categories      []Category `xml:"category"`
	Authors         []Person   `xml:"author"`
	Contributors    []Person   `xml:"contributor"`
	Rights          *HumanText `xml:"rights,omitempty"`
	Entries         []*Entry   `xml:"entry"`
}
// Generator is the Atom <generator> element: uri and version are XML
// attributes, Body is the element's character data.
type Generator struct {
	Uri     Iri    `xml:"uri,attr"`
	Version string `xml:"version,attr,omitempty"`
	Body    string `xml:",chardata"`
}
  91. // http://stackoverflow.com/a/25015260
  92. type iso8601 time.Time
  93. func (v iso8601) IsZero() bool { return time.Time(v).IsZero() }
  94. func (a iso8601) After(b iso8601) bool { return time.Time(a).After(time.Time(b)) }
  95. func (a iso8601) Before(b iso8601) bool { return time.Time(a).Before(time.Time(b)) }
  96. func (a iso8601) Format(fmt string) string { return time.Time(a).Format(fmt) }
  97. func (v iso8601) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
  98. e.EncodeElement(v.Format(time.RFC3339), start)
  99. return nil
  100. }
  101. func (c *iso8601) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  102. var v string
  103. d.DecodeElement(&v, &start)
  104. if parse, err := time.Parse(time.RFC3339, v); err != nil {
  105. return err
  106. } else {
  107. *c = iso8601(parse)
  108. return nil
  109. }
  110. }
// Link is an Atom <link> element; every field is carried as an XML attribute.
//
// see also https://godoc.org/golang.org/x/tools/blog/atom#Link
type Link struct {
	Href     string   `xml:"href,attr"`
	Rel      Relation `xml:"rel,attr,omitempty"`
	Type     MimeType `xml:"type,attr,omitempty"`
	HrefLang Lang     `xml:"hreflang,attr,omitempty"`
	Title    string   `xml:"title,attr,omitempty"`
	Length   int64    `xml:"length,attr,omitempty"`
}
// Person is an Atom person construct (used for <author> and <contributor>);
// name, email and uri are child elements.
//
// see also https://godoc.org/golang.org/x/tools/blog/atom#Person
type Person struct {
	Name  string `xml:"name"`
	Email string `xml:"email,omitempty"`
	Uri   Iri    `xml:"uri,omitempty"`
}
// Entry is an Atom <entry> element. Besides the Atom children it carries an
// optional thumbnail (http://search.yahoo.com/mrss/ namespace) and an
// optional point (http://www.georss.org/georss namespace).
//
// see also https://godoc.org/golang.org/x/tools/blog/atom#Entry
type Entry struct {
	XMLName      xml.Name   `xml:"http://www.w3.org/2005/Atom entry,omitempty"`
	XmlBase      Iri        `xml:"xml:base,attr,omitempty"`
	XmlLang      Lang       `xml:"xml:lang,attr,omitempty"`
	Title        HumanText  `xml:"title"`
	Summary      *HumanText `xml:"summary,omitempty"`
	Id           Id         `xml:"id"`
	Updated      iso8601    `xml:"updated"`
	Published    iso8601    `xml:"published,omitempty"`
	Links        []Link     `xml:"link"`
	Categories   []Category `xml:"category"`
	Authors      []Person   `xml:"author"`
	Contributors []Person   `xml:"contributor"`
	Content      *HumanText `xml:"content"`
	// Caution! Writing (Marshal/Encode) is still troublesome here: https://github.com/golang/go/issues/9519#issuecomment-252196382
	MediaThumbnail *MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail,omitempty"`
	GeoRssPoint    *GeoRssPoint    `xml:"http://www.georss.org/georss point,omitempty"`
}
// HumanText is the text construct used in this file for title, subtitle,
// summary, rights and content elements: Body is the element's character
// data; xml:lang, type and src are optional attributes.
type HumanText struct {
	XmlLang Lang     `xml:"xml:lang,attr,omitempty"`
	Body    string   `xml:",chardata"`
	Type    TextType `xml:"type,attr,omitempty"`
	Src     Iri      `xml:"src,attr,omitempty"`
}
// Category is an Atom <category> element; term, scheme and label are XML
// attributes.
type Category struct {
	Term   string `xml:"term,attr"`
	Scheme Iri    `xml:"scheme,attr,omitempty"`
	Label  string `xml:"label,attr,omitempty"`
}
// MediaThumbnail is a thumbnail element whose only content is the url
// attribute.
type MediaThumbnail struct {
	Url Iri `xml:"url,attr"`
}
  159. type Latitude float32
  160. type Longitude float32
  161. type GeoRssPoint struct {
  162. Lat Latitude
  163. Lon Longitude
  164. }
  165. func (v GeoRssPoint) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
  166. e.EncodeElement(fmt.Sprintf("%f %f", v.Lat, v.Lon), start)
  167. return nil
  168. }
  169. func (c *GeoRssPoint) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  170. var v string
  171. d.DecodeElement(&v, &start)
  172. res := strings.SplitN(v, " ", 2)
  173. if len(res) != 2 {
  174. return errors.New("Not a proper 'lat lon' pair.")
  175. }
  176. lat, err := strconv.ParseFloat(res[0], 32)
  177. if err != nil {
  178. return err
  179. }
  180. lon, err := strconv.ParseFloat(res[1], 32)
  181. if err != nil {
  182. return err
  183. }
  184. *c = GeoRssPoint{Lat: Latitude(lat), Lon: Longitude(lon)}
  185. return nil
  186. }
  187. func xmlEncodeWithXslt(e interface{}, hrefXslt string, enc *xml.Encoder) error {
  188. var err error
  189. // preamble
  190. if err = enc.EncodeToken(xml.ProcInst{Target: "xml", Inst: []byte(`version="1.0" encoding="UTF-8"`)}); err == nil {
  191. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  192. if err = enc.EncodeToken(xml.ProcInst{Target: "xml-stylesheet", Inst: []byte("type='text/xsl' href='" + hrefXslt + "'")}); err == nil {
  193. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  194. if err = enc.EncodeToken(xml.Comment(lengthyAtomPreambleComment)); err == nil {
  195. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  196. if err = enc.Encode(e); err == nil {
  197. err = enc.EncodeToken(xml.CharData("\n"))
  198. }
  199. }
  200. }
  201. }
  202. }
  203. }
  204. }
  205. return err
  206. }
  207. func (feed *Feed) Append(e *Entry) (*Entry, error) {
  208. if err := e.Validate(); err != nil {
  209. return nil, err
  210. }
  211. // todo: pre-check uniqueness of Id
  212. feed.Entries = append(feed.Entries, e)
  213. return e, nil
  214. }
  215. // sort.Interface
  216. type ByPublishedDesc []*Entry
  217. func (a ByPublishedDesc) Len() int { return len(a) }
  218. func (a ByPublishedDesc) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  219. func (a ByPublishedDesc) Less(i, j int) bool { return !a[i].Published.Before(a[j].Published) }
  220. type ByUpdatedDesc []*Entry
  221. func (a ByUpdatedDesc) Len() int { return len(a) }
  222. func (a ByUpdatedDesc) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  223. func (a ByUpdatedDesc) Less(i, j int) bool { return !a[i].Updated.Before(a[j].Updated) }
  224. // custom interface
  225. // sufficient for 32 bit.
  226. func base64ToBase24x7(b64 string) (string, error) {
  227. if data, err := base64.RawURLEncoding.DecodeString(b64); err != nil {
  228. return "", err
  229. } else {
  230. // check len(data) ?
  231. ui32 := binary.LittleEndian.Uint32(data)
  232. base24 := fmt.Sprintf("%07s", strconv.FormatUint(uint64(ui32), 24))
  233. return strings.Map(mapBase24ToSuperCareful, base24), nil
  234. }
  235. }
  236. // Being "super-careful" https://code.mro.name/mro/ProgrammableWebSwartz2013/src/master/content/pages/2-building-for-users.md
  237. //
  238. // 0123456789abcdefghijklmn ->
  239. // 23456789abcdefghkrstuxyz
  240. func mapBase24ToSuperCareful(r rune) rune {
  241. digits := []rune("23456789abcdefghkrstuxyz")
  242. switch {
  243. case '0' <= r && r <= '9':
  244. return digits[:10][r-'0']
  245. case r >= 'a' && r <= 'n':
  246. return digits[10:][r-'a']
  247. }
  248. panic("ouch")
  249. }
  250. func newRandomId(t time.Time) Id {
  251. ui32 := uint32(t.Unix() & 0xFFFFFFFF) // unix time in seconds as uint32
  252. base24 := fmt.Sprintf("%07s", strconv.FormatUint(uint64(ui32), 24))
  253. return Id(strings.Map(mapBase24ToSuperCareful, base24))
  254. }
  255. func (feed Feed) newUniqueId(t time.Time) Id {
  256. id := newRandomId(t)
  257. for _, entry := range feed.Entries {
  258. if entry.Id == id {
  259. panic("id not unique")
  260. }
  261. }
  262. return id
  263. }
  264. func (feed Feed) newEntry(t time.Time) *Entry {
  265. defer un(trace("Feed.newEntry(t)"))
  266. return &Entry{
  267. Authors: feed.Authors,
  268. Published: iso8601(t),
  269. Id: feed.newUniqueId(t),
  270. }
  271. }
  272. func (feed *Feed) findEntry(doesMatch func(*Entry) bool) (int, *Entry) {
  273. defer un(trace(strings.Join([]string{"Feed.findEntry(f(*Entry))"}, "")))
  274. if nil != doesMatch {
  275. for idx, entry := range feed.Entries {
  276. if doesMatch(entry) {
  277. return idx, entry
  278. }
  279. }
  280. }
  281. return -1, nil
  282. }
  283. func (feed *Feed) findEntryById(id Id) (int, *Entry) {
  284. defer un(trace(strings.Join([]string{"Feed.findEntryById('", string(id), "')"}, "")))
  285. if "" != id {
  286. return feed.findEntry(func(entry *Entry) bool { return id == entry.Id })
  287. }
  288. return feed.findEntry(nil)
  289. }
  290. func (feed *Feed) deleteEntryById(id Id) *Entry {
  291. if i, entry := feed.findEntryById(id); i < 0 {
  292. return nil
  293. } else {
  294. a := feed.Entries
  295. // https://github.com/golang/go/wiki/SliceTricks
  296. copy(a[i:], a[i+1:])
  297. // a[len(a)-1] = nil // or the zero value of T
  298. feed.Entries = a[:len(a)-1]
  299. feed.Updated = iso8601(time.Now())
  300. // don' try to be smart. When removing days feeds, we rely on correct Published date.
  301. // entry.Published = iso8601{time.Time{}}
  302. // entry.Updated = entry.Published
  303. return entry
  304. }
  305. }
  306. func (feed Feed) SaveToFile(dst string) error {
  307. defer un(trace("Feed.SaveToFile"))
  308. sort.Sort(ByPublishedDesc(feed.Entries))
  309. // remove deleted entries? Maybe Published date zero.
  310. tmp := dst + "~"
  311. var err error
  312. var w *os.File
  313. if w, err = os.Create(tmp); err == nil {
  314. enc := xml.NewEncoder(w)
  315. enc.Indent("", " ")
  316. if err = enc.Encode(feed); err == nil {
  317. if err = enc.Flush(); err == nil {
  318. if err = w.Close(); err == nil {
  319. if err = os.Rename(dst, dst+".bak"); err == nil || os.IsNotExist(err) {
  320. if err = os.Rename(tmp, dst); err == nil {
  321. return nil
  322. }
  323. }
  324. }
  325. }
  326. }
  327. }
  328. return err
  329. }
  330. // Validate for storage
  331. func (entry *Entry) Validate() error {
  332. if "" == entry.Id {
  333. return errors.New("Entry may not have empty Id.")
  334. }
  335. if 1 < len(entry.Links) {
  336. return fmt.Errorf("Entry may not have more than one link. Entry.Id='%s'", entry.Id)
  337. }
  338. if 1 == len(entry.Links) {
  339. if "" == entry.Links[0].Href {
  340. return fmt.Errorf("Entry may not have empty link. Entry.Id='%s'", entry.Id)
  341. }
  342. url := mustParseURL(entry.Links[0].Href)
  343. if !url.IsAbs() {
  344. return fmt.Errorf("Entry must have absolute Link. Entry.Id='%s'", entry.Id)
  345. }
  346. if "" == url.Host {
  347. return fmt.Errorf("Entry must have Link with non-empty host. Entry.Id='%s'", entry.Id)
  348. }
  349. }
  350. return nil
  351. }
  352. func AggregateCategories(entries []*Entry) []Category {
  353. // aggregate & count feed entry categories
  354. cats := make(map[string]int, 1*len(entries)) // raw len guess
  355. for _, ent := range entries {
  356. for _, cat := range ent.Categories {
  357. cats[cat.Term] += 1
  358. }
  359. }
  360. cs := make([]Category, 0, len(cats))
  361. for term, count := range cats {
  362. if term != "" && count != 0 {
  363. cs = append(cs, Category{Term: term, Label: strconv.Itoa(count)})
  364. }
  365. }
  366. sort.Slice(cs, func(i, j int) bool {
  367. return strings.Compare(cs[i].Term, cs[j].Term) < 0
  368. })
  369. return cs
  370. }
  371. func (ht HumanText) Categories() []Category {
  372. ret := make([]Category, 0, 10)
  373. for _, t := range tagsFromString(ht.Body) {
  374. ret = append(ret, Category{Term: t})
  375. }
  376. return ret
  377. }
  378. const iWillBeALineFeedMarker = "+,zX@D4X#%`lGdX-vWU?/==v"
  379. func cleanLegacyContent(txt string) string {
  380. src := strings.Replace(txt, "<br />", iWillBeALineFeedMarker, -1)
  381. if node, err := html.Parse(strings.NewReader(src)); err == nil {
  382. str := strings.Replace(scrape.Text(node), iWillBeALineFeedMarker, "", -1)
  383. return strings.Trim(str[:len(str)-len("( Permalink )")], " ")
  384. } else {
  385. return err.Error()
  386. }
  387. }