atom.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. //
  2. // Copyright (C) 2017-2018 Marcus Rohrmoser, http://purl.mro.name/ShaarliGo
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU General Public License
  15. // along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. //
  17. package main
  18. import (
  19. "bufio"
  20. "encoding/base64"
  21. "encoding/binary"
  22. "encoding/xml"
  23. "errors"
  24. "fmt"
  25. "io"
  26. "os"
  27. "sort"
  28. "strconv"
  29. "strings"
  30. "time"
  31. "unicode"
  32. // "golang.org/x/tools/blog/atom"
  33. "github.com/yhat/scrape"
  34. "golang.org/x/net/html"
  35. )
  36. const lengthyAtomPreambleComment string = `
  37. https://developer.mozilla.org/en/docs/XSL_Transformations_in_Mozilla_FAQ#Why_isn.27t_my_stylesheet_applied.3F
  38. Caution! Firefox ignores your XSLT stylesheet if your XML looks like an RSS or Atom feed. A typical workaround is to insert an XML comment at the beginning of your XML file to move the <fEEd or <rsS tag out of the first 512 bytes used by Firefox to guess whether it is a feed or not.
  39. See also the discussion at https://bugzilla.mozilla.org/show_bug.cgi?id=338621#c72.
  40. For best results, serve both atom feed and xslt as 'text/xml' or 'application/xml' without charset specified.
  41. `
  42. const atomNamespace = "http://www.w3.org/2005/Atom"
  43. var emojiRunes map[rune]struct{}
  44. func init() {
  45. emojiRunes = make(map[rune]struct{}, len(emojiCodeMap))
  46. for _, v := range emojiCodeMap {
  47. r := []rune(v)[0]
  48. emojiRunes[r] = struct{}{}
  49. }
  50. emojiCodeMap = nil
  51. }
  52. func FeedFromFileName(file string) (Feed, error) {
  53. if read, err := os.Open(file); nil == read || nil != err {
  54. return Feed{}, err
  55. } else {
  56. defer read.Close()
  57. return FeedFromReader(read)
  58. }
  59. }
  60. func FeedFromReader(file io.Reader) (Feed, error) {
  61. ret := Feed{}
  62. err := xml.NewDecoder(file).Decode(&ret)
  63. return ret, err
  64. }
// Feed maps the root element of an Atom document: element "feed" in the
// namespace http://www.w3.org/2005/Atom. The struct tags are used both when
// decoding (FeedFromReader) and when encoding (SaveToFile) with encoding/xml.
//
// http://atomenabled.org/developers/syndication/
//
// see also https://godoc.org/golang.org/x/tools/blog/atom#Feed
type Feed struct {
	XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
	// xml:base and xml:lang attributes of the root element.
	XmlBase string `xml:"xml:base,attr,omitempty"`
	XmlLang string `xml:"xml:lang,attr,omitempty"`
	// Namespace declarations and prefixed names are spelled out literally in
	// the tags; see the linked Go issue comment for the background.
	XmlNSShaarliGo  string `xml:"xmlns:sg,attr,omitempty"`         // https://github.com/golang/go/issues/9519#issuecomment-252196382
	SearchTerms     string `xml:"sg:searchTerms,attr,omitempty"`   // rather use http://www.opensearch.org/Specifications/OpenSearch/1.1#Example_of_OpenSearch_response_elements_in_Atom_1.0
	XmlNSOpenSearch string `xml:"xmlns:opensearch,attr,omitempty"` // https://github.com/golang/go/issues/9519#issuecomment-252196382
	Query           string `xml:"opensearch:Query,omitempty"`      // http://www.opensearch.org/Specifications/OpenSearch/1.1#Example_of_OpenSearch_response_elements_in_Atom_1.0
	// Feed level metadata; each field maps to the child element named in its tag.
	Title        HumanText  `xml:"title"`
	Subtitle     *HumanText `xml:"subtitle,omitempty"`
	Id           string     `xml:"id"`
	Updated      iso8601    `xml:"updated"`
	Generator    *Generator `xml:"generator,omitempty"`
	Icon         string     `xml:"icon,omitempty"`
	Logo         string     `xml:"logo,omitempty"`
	Links        []Link     `xml:"link"`
	Categories   []Category `xml:"category"`
	Authors      []Person   `xml:"author"`
	Contributors []Person   `xml:"contributor"`
	Rights       *HumanText `xml:"rights,omitempty"`
	// Entries holds the "entry" children. SaveToFile sorts this slice by
	// Published, newest first, before writing.
	Entries []*Entry `xml:"entry"`
}
// Generator maps the "generator" element of a Feed: the uri and version
// attributes plus the element's character data.
type Generator struct {
	Uri     string `xml:"uri,attr"`
	Version string `xml:"version,attr,omitempty"`
	Body    string `xml:",chardata"`
}
  95. // http://stackoverflow.com/a/25015260
  96. type iso8601 time.Time
  97. func (v iso8601) IsZero() bool { return time.Time(v).IsZero() }
  98. func (a iso8601) After(b iso8601) bool { return time.Time(a).After(time.Time(b)) }
  99. func (a iso8601) Before(b iso8601) bool { return time.Time(a).Before(time.Time(b)) }
  100. func (a iso8601) Format(fmt string) string { return time.Time(a).Format(fmt) }
  101. func (v iso8601) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
  102. e.EncodeElement(v.Format(time.RFC3339), start)
  103. return nil
  104. }
  105. func (c *iso8601) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  106. var v string
  107. d.DecodeElement(&v, &start)
  108. if parse, err := time.Parse(time.RFC3339, v); err != nil {
  109. return err
  110. } else {
  111. *c = iso8601(parse)
  112. return nil
  113. }
  114. }
// Link maps the Atom "link" element; every field is an XML attribute. Href is
// always written, the other attributes are omitted when empty.
//
// see also https://godoc.org/golang.org/x/tools/blog/atom#Link
type Link struct {
	Href     string `xml:"href,attr"`
	Rel      string `xml:"rel,attr,omitempty"`
	Type     string `xml:"type,attr,omitempty"`
	HrefLang string `xml:"hreflang,attr,omitempty"`
	Title    string `xml:"title,attr,omitempty"`
	Length   int64  `xml:"length,attr,omitempty"`
}
// Person maps the child elements name, email and uri; Feed and Entry use it
// for their "author" and "contributor" elements.
//
// see also https://godoc.org/golang.org/x/tools/blog/atom#Person
type Person struct {
	Name  string `xml:"name"`
	Email string `xml:"email,omitempty"`
	Uri   string `xml:"uri,omitempty"`
}
// Entry maps an Atom "entry" element (namespace http://www.w3.org/2005/Atom).
// Validate checks the constraints required before an Entry is stored in a
// Feed.
//
// see also https://godoc.org/golang.org/x/tools/blog/atom#Entry
type Entry struct {
	XMLName xml.Name `xml:"http://www.w3.org/2005/Atom entry,omitempty"`
	XmlBase string   `xml:"xml:base,attr,omitempty"`
	XmlLang string   `xml:"xml:lang,attr,omitempty"`
	// Entry content and metadata; each field maps to the child element named
	// in its tag.
	// NOTE(review): omitempty on Published probably has no effect because
	// iso8601 is a struct type - confirm against encoding/xml.
	Title        HumanText  `xml:"title"`
	Summary      *HumanText `xml:"summary,omitempty"`
	Id           string     `xml:"id"`
	Updated      iso8601    `xml:"updated"`
	Published    iso8601    `xml:"published,omitempty"`
	Links        []Link     `xml:"link"`
	Categories   []Category `xml:"category"`
	Authors      []Person   `xml:"author"`
	Contributors []Person   `xml:"contributor"`
	Content      *HumanText `xml:"content"`
	// Caution! Writing (Marshal/Encode) these is still troublesome: https://github.com/golang/go/issues/9519#issuecomment-252196382
	MediaThumbnail *MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail,omitempty"`
	GeoRssPoint    *GeoRssPoint    `xml:"http://www.georss.org/georss point,omitempty"`
}
// HumanText maps a text-carrying element such as title, subtitle, rights,
// summary or content: the character data goes into Body, the xml:lang, type
// and src attributes into the remaining fields.
type HumanText struct {
	XmlLang string `xml:"xml:lang,attr,omitempty"`
	Body    string `xml:",chardata"`
	Type    string `xml:"type,attr,omitempty"`
	Src     string `xml:"src,attr,omitempty"`
}
// Category maps the "category" element with its term, scheme and label
// attributes. AggregateCategories reuses Label to carry the number of
// entries per term.
type Category struct {
	Term   string `xml:"term,attr"`
	Scheme string `xml:"scheme,attr,omitempty"`
	Label  string `xml:"label,attr,omitempty"`
}
// MediaThumbnail carries the url attribute of the thumbnail element that
// Entry maps into the http://search.yahoo.com/mrss/ namespace.
type MediaThumbnail struct {
	Url string `xml:"url,attr"`
}
  163. type GeoRssPoint struct {
  164. Lat, Lon float32
  165. }
  166. func (v GeoRssPoint) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
  167. e.EncodeElement(fmt.Sprintf("%f %f", v.Lat, v.Lon), start)
  168. return nil
  169. }
  170. func (c *GeoRssPoint) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  171. var v string
  172. d.DecodeElement(&v, &start)
  173. res := strings.SplitN(v, " ", 2)
  174. if len(res) != 2 {
  175. return errors.New("Not a proper 'lat lon' pair.")
  176. }
  177. lat, err := strconv.ParseFloat(res[0], 32)
  178. if err != nil {
  179. return err
  180. }
  181. lon, err := strconv.ParseFloat(res[1], 32)
  182. if err != nil {
  183. return err
  184. }
  185. *c = GeoRssPoint{Lat: float32(lat), Lon: float32(lon)}
  186. return nil
  187. }
  188. func xmlEncodeWithXslt(e interface{}, hrefXslt string, enc *xml.Encoder) error {
  189. var err error
  190. // preamble
  191. if err = enc.EncodeToken(xml.ProcInst{Target: "xml", Inst: []byte(`version="1.0" encoding="UTF-8"`)}); err == nil {
  192. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  193. if err = enc.EncodeToken(xml.ProcInst{Target: "xml-stylesheet", Inst: []byte("type='text/xsl' href='" + hrefXslt + "'")}); err == nil {
  194. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  195. if err = enc.EncodeToken(xml.Comment(lengthyAtomPreambleComment)); err == nil {
  196. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  197. if err = enc.Encode(e); err == nil {
  198. err = enc.EncodeToken(xml.CharData("\n"))
  199. }
  200. }
  201. }
  202. }
  203. }
  204. }
  205. }
  206. return err
  207. }
  208. func (feed *Feed) Append(e *Entry) (*Entry, error) {
  209. if err := e.Validate(); err != nil {
  210. return nil, err
  211. }
  212. // todo: pre-check uniqueness of Id
  213. feed.Entries = append(feed.Entries, e)
  214. return e, nil
  215. }
  216. // sort.Interface
  217. type ByPublishedDesc []*Entry
  218. func (a ByPublishedDesc) Len() int { return len(a) }
  219. func (a ByPublishedDesc) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  220. func (a ByPublishedDesc) Less(i, j int) bool { return !a[i].Published.Before(a[j].Published) }
  221. type ByUpdatedDesc []*Entry
  222. func (a ByUpdatedDesc) Len() int { return len(a) }
  223. func (a ByUpdatedDesc) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  224. func (a ByUpdatedDesc) Less(i, j int) bool { return !a[i].Updated.Before(a[j].Updated) }
  225. // custom interface
  226. // sufficient for 32 bit.
  227. func base64ToBase24x7(b64 string) (string, error) {
  228. if data, err := base64.RawURLEncoding.DecodeString(b64); err != nil {
  229. return "", err
  230. } else {
  231. // check len(data) ?
  232. ui32 := binary.LittleEndian.Uint32(data)
  233. base24 := fmt.Sprintf("%07s", strconv.FormatUint(uint64(ui32), 24))
  234. return strings.Map(mapBase24ToSuperCareful, base24), nil
  235. }
  236. }
  237. // Being "super-careful" https://code.mro.name/mro/ProgrammableWebSwartz2013/src/master/content/pages/2-building-for-users.md
  238. //
  239. // 0123456789abcdefghijklmn ->
  240. // 23456789abcdefghkrstuxyz
  241. func mapBase24ToSuperCareful(r rune) rune {
  242. digits := []rune("23456789abcdefghkrstuxyz")
  243. switch {
  244. case '0' <= r && r <= '9':
  245. return digits[:10][r-'0']
  246. case r >= 'a' && r <= 'n':
  247. return digits[10:][r-'a']
  248. }
  249. panic("ouch")
  250. }
  251. func newRandomId(t time.Time) string {
  252. ui32 := uint32(t.Unix() & 0xFFFFFFFF) // unix time in seconds as uint32
  253. base24 := fmt.Sprintf("%07s", strconv.FormatUint(uint64(ui32), 24))
  254. return strings.Map(mapBase24ToSuperCareful, base24)
  255. }
  256. func (feed Feed) newUniqueId(t time.Time) string {
  257. id := newRandomId(t)
  258. for _, entry := range feed.Entries {
  259. if entry.Id == id {
  260. panic("id not unique")
  261. }
  262. }
  263. return id
  264. }
  265. func (feed Feed) newEntry(t time.Time) *Entry {
  266. defer un(trace("Feed.newEntry(t)"))
  267. return &Entry{
  268. Authors: feed.Authors,
  269. Published: iso8601(t),
  270. Id: feed.newUniqueId(t),
  271. }
  272. }
  273. func (feed *Feed) findEntry(doesMatch func(*Entry) bool) (int, *Entry) {
  274. defer un(trace(strings.Join([]string{"Feed.findEntry(f(*Entry))"}, "")))
  275. if nil != doesMatch {
  276. for idx, entry := range feed.Entries {
  277. if doesMatch(entry) {
  278. return idx, entry
  279. }
  280. }
  281. }
  282. return -1, nil
  283. }
  284. func (feed *Feed) findEntryById(id string) (int, *Entry) {
  285. defer un(trace(strings.Join([]string{"Feed.findEntryById('", id, "')"}, "")))
  286. if "" != id {
  287. return feed.findEntry(func(entry *Entry) bool { return id == entry.Id })
  288. }
  289. return feed.findEntry(nil)
  290. }
  291. func (feed *Feed) deleteEntry(id string) *Entry {
  292. if i, entry := feed.findEntryById(id); i >= 0 {
  293. a := feed.Entries
  294. // https://github.com/golang/go/wiki/SliceTricks
  295. copy(a[i:], a[i+1:])
  296. a[len(a)-1] = nil // or the zero value of T
  297. feed.Entries = a[:len(a)-1]
  298. // don' try to be smart. When removing days feeds, we rely on correct Published date.
  299. // entry.Published = iso8601{time.Time{}}
  300. // entry.Updated = entry.Published
  301. return entry
  302. }
  303. return nil
  304. }
  305. func (feed Feed) SaveToFile(dst string) error {
  306. defer un(trace("Feed.SaveToFile"))
  307. sort.Sort(ByPublishedDesc(feed.Entries))
  308. // remove deleted entries? Maybe Published date zero.
  309. tmp := dst + "~"
  310. var err error
  311. var w *os.File
  312. if w, err = os.Create(tmp); err == nil {
  313. enc := xml.NewEncoder(w)
  314. enc.Indent("", " ")
  315. if err = enc.Encode(feed); err == nil {
  316. if err = enc.Flush(); err == nil {
  317. if err = w.Close(); err == nil {
  318. if err = os.Rename(dst, dst+".bak"); err == nil || os.IsNotExist(err) {
  319. if err = os.Rename(tmp, dst); err == nil {
  320. return nil
  321. }
  322. }
  323. }
  324. }
  325. }
  326. }
  327. return err
  328. }
  329. // Validate for storage
  330. func (entry *Entry) Validate() error {
  331. if "" == entry.Id {
  332. return errors.New("Entry may not have empty Id.")
  333. }
  334. if 1 < len(entry.Links) {
  335. return fmt.Errorf("Entry may not have more than one link. Entry.Id='%s'", entry.Id)
  336. }
  337. if 1 == len(entry.Links) {
  338. if "" == entry.Links[0].Href {
  339. return fmt.Errorf("Entry may not have empty link. Entry.Id='%s'", entry.Id)
  340. }
  341. url := mustParseURL(entry.Links[0].Href)
  342. if !url.IsAbs() {
  343. return fmt.Errorf("Entry must have absolute Link. Entry.Id='%s'", entry.Id)
  344. }
  345. if "" == url.Host {
  346. return fmt.Errorf("Entry must have Link with non-empty host. Entry.Id='%s'", entry.Id)
  347. }
  348. }
  349. return nil
  350. }
  351. func (entry Entry) CategoriesMerged() []Category {
  352. a := entry.Title.Categories()
  353. b := entry.Content.Categories()
  354. ret := make([]Category, 0, len(a)+len(b)+len(entry.Categories))
  355. ret = append(ret, a...)
  356. ret = append(ret, b...)
  357. ret = append(ret, entry.Categories...)
  358. sort.Slice(ret, func(i, j int) bool { return strings.Compare(ret[i].Term, ret[j].Term) < 0 })
  359. return uniqCategory(ret)
  360. }
  361. func AggregateCategories(entries []*Entry) []Category {
  362. // aggregate & count feed entry categories
  363. cats := make(map[string]int, 1*len(entries)) // raw len guess
  364. for _, ent := range entries {
  365. for _, cat := range ent.Categories {
  366. cats[cat.Term] += 1
  367. }
  368. }
  369. cs := make([]Category, 0, len(cats))
  370. for term, count := range cats {
  371. if term != "" && count != 0 {
  372. cs = append(cs, Category{Term: term, Label: strconv.Itoa(count)})
  373. }
  374. }
  375. sort.Slice(cs, func(i, j int) bool {
  376. return strings.Compare(cs[i].Term, cs[j].Term) < 0
  377. })
  378. return cs
  379. }
  380. func uniqCategory(data []Category) []Category {
  381. ret := make([]Category, 0, len(data))
  382. for i, e := range data {
  383. if "" == e.Term {
  384. continue
  385. }
  386. if i == 0 || e.Term != data[i-1].Term {
  387. ret = append(ret, e)
  388. }
  389. }
  390. return ret
  391. }
  392. func (ht HumanText) Categories() []Category {
  393. ret := make([]Category, 0, 10)
  394. for t, _ := range tagsFromString(ht.Body) {
  395. ret = append(ret, Category{Term: t})
  396. }
  397. return ret
  398. }
  399. func isTag(s string) bool {
  400. for _, r := range s {
  401. if '#' == r {
  402. return true
  403. }
  404. _, ok := emojiRunes[r]
  405. return ok
  406. }
  407. return false
  408. }
  409. func tagsFromString(str string) map[string]struct{} {
  410. scanner := bufio.NewScanner(strings.NewReader(str))
  411. scanner.Split(bufio.ScanWords)
  412. ret := make(map[string]struct{}, 10)
  413. for scanner.Scan() {
  414. term := scanner.Text()
  415. if !isTag(term) {
  416. continue
  417. }
  418. term = strings.TrimLeft(term, "#")
  419. term = strings.TrimRightFunc(term, func(r rune) bool {
  420. return !('§' == r || '†' == r) && unicode.IsPunct(r)
  421. })
  422. if "" != term {
  423. ret[term] = struct{}{}
  424. }
  425. }
  426. return ret
  427. }
  428. const iWillBeALineFeedMarker = "+,zX@D4X#%`lGdX-vWU?/==v"
  429. func cleanLegacyContent(txt string) string {
  430. src := strings.Replace(txt, "<br />", iWillBeALineFeedMarker, -1)
  431. if node, err := html.Parse(strings.NewReader(src)); err == nil {
  432. str := strings.Replace(scrape.Text(node), iWillBeALineFeedMarker, "", -1)
  433. return strings.Trim(str[:len(str)-len("( Permalink )")], " ")
  434. } else {
  435. return err.Error()
  436. }
  437. }