atom.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526
  1. //
  2. // Copyright (C) 2017-2019 Marcus Rohrmoser, http://purl.mro.name/ShaarliGo
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU General Public License
  15. // along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. //
  17. package main
  18. import (
  19. "bufio"
  20. "encoding/base64"
  21. "encoding/binary"
  22. "encoding/xml"
  23. "errors"
  24. "fmt"
  25. "io"
  26. "os"
  27. "sort"
  28. "strconv"
  29. "strings"
  30. "time"
  31. "unicode"
  32. // "golang.org/x/tools/blog/atom"
  33. "github.com/yhat/scrape"
  34. "golang.org/x/net/html"
  35. )
  36. const lengthyAtomPreambleComment string = `
  37. https://developer.mozilla.org/en/docs/XSL_Transformations_in_Mozilla_FAQ#Why_isn.27t_my_stylesheet_applied.3F
  38. Caution! Firefox ignores your XSLT stylesheet if your XML looks like a RSS or Atom feed. A typical workaround is to insert an XML comment at the beginning of your XML file to move the <fEEd or <rsS tag out of the first 512 bytes used by Firefox to guess whether it is a feed or not.
  39. See also the discussion at https://bugzilla.mozilla.org/show_bug.cgi?id=338621#c72.
  40. For best results, serve both atom feed and xslt as 'text/xml' or 'application/xml' without charset specified.
  41. `
  42. const atomNamespace = "http://www.w3.org/2005/Atom"
  43. var emojiRunes map[rune]struct{}
  44. func init() {
  45. emojiRunes = make(map[rune]struct{}, len(emojiCodeMap))
  46. for _, v := range emojiCodeMap {
  47. r := []rune(v)[0]
  48. emojiRunes[r] = struct{}{}
  49. }
  50. emojiCodeMap = nil
  51. }
  52. func FeedFromFileName(file string) (Feed, error) {
  53. if read, err := os.Open(file); nil == read || nil != err {
  54. return Feed{}, err
  55. } else {
  56. defer read.Close()
  57. return FeedFromReader(read)
  58. }
  59. }
  60. func FeedFromReader(file io.Reader) (Feed, error) {
  61. ret := Feed{}
  62. err := xml.NewDecoder(file).Decode(&ret)
  63. return ret, err
  64. }
  65. type Iri string // https://tools.ietf.org/html/rfc3987
  66. type Id Iri // we allow relative Ids (in persistent store)
  67. type Lang string // https://tools.ietf.org/html/rfc3066
  68. type Relation string // https://www.iana.org/assignments/link-relations/link-relations.xhtml#link-relations-1
  69. type MimeType string // https://tools.ietf.org/html/rfc2045#section-5.1
  70. type TextType string // https://tools.ietf.org/html/rfc4287#section-4.1.3.1
  71. // https://mro.github.io/atomenabled.org/
  72. // https://tools.ietf.org/html/rfc4287#section-4.1.1
  73. //
  74. // see also https://godoc.org/golang.org/x/tools/blog/atom#Feed
  75. type Feed struct {
  76. XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
  77. XmlBase Iri `xml:"xml:base,attr,omitempty"`
  78. XmlLang Lang `xml:"xml:lang,attr,omitempty"`
  79. XmlNSShaarliGo string `xml:"xmlns:sg,attr,omitempty"` // https://github.com/golang/go/issues/9519#issuecomment-252196382
  80. SearchTerms string `xml:"sg:searchTerms,attr,omitempty"` // rather use http://www.opensearch.org/Specifications/OpenSearch/1.1#Example_of_OpenSearch_response_elements_in_Atom_1.0
  81. XmlNSOpenSearch string `xml:"xmlns:opensearch,attr,omitempty"` // https://github.com/golang/go/issues/9519#issuecomment-252196382
  82. Query string `xml:"opensearch:Query,omitempty"` // http://www.opensearch.org/Specifications/OpenSearch/1.1#Example_of_OpenSearch_response_elements_in_Atom_1.0
  83. Title HumanText `xml:"title"`
  84. Subtitle *HumanText `xml:"subtitle,omitempty"`
  85. Id Id `xml:"id"`
  86. Updated iso8601 `xml:"updated"`
  87. Generator *Generator `xml:"generator,omitempty"`
  88. Icon Iri `xml:"icon,omitempty"`
  89. Logo Iri `xml:"logo,omitempty"`
  90. Links []Link `xml:"link"`
  91. Categories []Category `xml:"category"`
  92. Authors []Person `xml:"author"`
  93. Contributors []Person `xml:"contributor"`
  94. Rights *HumanText `xml:"rights,omitempty"`
  95. Entries []*Entry `xml:"entry"`
  96. }
  97. type Generator struct {
  98. Uri Iri `xml:"uri,attr"`
  99. Version string `xml:"version,attr,omitempty"`
  100. Body string `xml:",chardata"`
  101. }
  102. // http://stackoverflow.com/a/25015260
  103. type iso8601 time.Time
  104. func (v iso8601) IsZero() bool { return time.Time(v).IsZero() }
  105. func (a iso8601) After(b iso8601) bool { return time.Time(a).After(time.Time(b)) }
  106. func (a iso8601) Before(b iso8601) bool { return time.Time(a).Before(time.Time(b)) }
  107. func (a iso8601) Format(fmt string) string { return time.Time(a).Format(fmt) }
  108. func (v iso8601) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
  109. e.EncodeElement(v.Format(time.RFC3339), start)
  110. return nil
  111. }
  112. func (c *iso8601) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  113. var v string
  114. d.DecodeElement(&v, &start)
  115. if parse, err := time.Parse(time.RFC3339, v); err != nil {
  116. return err
  117. } else {
  118. *c = iso8601(parse)
  119. return nil
  120. }
  121. }
  122. // see also https://godoc.org/golang.org/x/tools/blog/atom#Link
  123. type Link struct {
  124. Href string `xml:"href,attr"`
  125. Rel Relation `xml:"rel,attr,omitempty"`
  126. Type MimeType `xml:"type,attr,omitempty"`
  127. HrefLang Lang `xml:"hreflang,attr,omitempty"`
  128. Title string `xml:"title,attr,omitempty"`
  129. Length int64 `xml:"length,attr,omitempty"`
  130. }
  131. // see also https://godoc.org/golang.org/x/tools/blog/atom#Person
  132. type Person struct {
  133. Name string `xml:"name"`
  134. Email string `xml:"email,omitempty"`
  135. Uri Iri `xml:"uri,omitempty"`
  136. }
  137. // see also https://godoc.org/golang.org/x/tools/blog/atom#Entry
  138. type Entry struct {
  139. XMLName xml.Name `xml:"http://www.w3.org/2005/Atom entry,omitempty"`
  140. XmlBase Iri `xml:"xml:base,attr,omitempty"`
  141. XmlLang Lang `xml:"xml:lang,attr,omitempty"`
  142. Title HumanText `xml:"title"`
  143. Summary *HumanText `xml:"summary,omitempty"`
  144. Id Id `xml:"id"`
  145. Updated iso8601 `xml:"updated"`
  146. Published iso8601 `xml:"published,omitempty"`
  147. Links []Link `xml:"link"`
  148. Categories []Category `xml:"category"`
  149. Authors []Person `xml:"author"`
  150. Contributors []Person `xml:"contributor"`
  151. Content *HumanText `xml:"content"`
  152. // Vorsicht! beim Schreiben (Marshal/Encode) fuchst's noch: https://github.com/golang/go/issues/9519#issuecomment-252196382
  153. MediaThumbnail *MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail,omitempty"`
  154. GeoRssPoint *GeoRssPoint `xml:"http://www.georss.org/georss point,omitempty"`
  155. }
  156. type HumanText struct {
  157. XmlLang Lang `xml:"xml:lang,attr,omitempty"`
  158. Body string `xml:",chardata"`
  159. Type TextType `xml:"type,attr,omitempty"`
  160. Src Iri `xml:"src,attr,omitempty"`
  161. }
  162. type Category struct {
  163. Term string `xml:"term,attr"`
  164. Scheme Iri `xml:"scheme,attr,omitempty"`
  165. Label string `xml:"label,attr,omitempty"`
  166. }
  167. type MediaThumbnail struct {
  168. Url Iri `xml:"url,attr"`
  169. }
  170. type Latitude float32
  171. type Longitude float32
  172. type GeoRssPoint struct {
  173. Lat Latitude
  174. Lon Longitude
  175. }
  176. func (v GeoRssPoint) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
  177. e.EncodeElement(fmt.Sprintf("%f %f", v.Lat, v.Lon), start)
  178. return nil
  179. }
  180. func (c *GeoRssPoint) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  181. var v string
  182. d.DecodeElement(&v, &start)
  183. res := strings.SplitN(v, " ", 2)
  184. if len(res) != 2 {
  185. return errors.New("Not a proper 'lat lon' pair.")
  186. }
  187. lat, err := strconv.ParseFloat(res[0], 32)
  188. if err != nil {
  189. return err
  190. }
  191. lon, err := strconv.ParseFloat(res[1], 32)
  192. if err != nil {
  193. return err
  194. }
  195. *c = GeoRssPoint{Lat: Latitude(lat), Lon: Longitude(lon)}
  196. return nil
  197. }
  198. func xmlEncodeWithXslt(e interface{}, hrefXslt string, enc *xml.Encoder) error {
  199. var err error
  200. // preamble
  201. if err = enc.EncodeToken(xml.ProcInst{Target: "xml", Inst: []byte(`version="1.0" encoding="UTF-8"`)}); err == nil {
  202. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  203. if err = enc.EncodeToken(xml.ProcInst{Target: "xml-stylesheet", Inst: []byte("type='text/xsl' href='" + hrefXslt + "'")}); err == nil {
  204. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  205. if err = enc.EncodeToken(xml.Comment(lengthyAtomPreambleComment)); err == nil {
  206. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  207. if err = enc.Encode(e); err == nil {
  208. err = enc.EncodeToken(xml.CharData("\n"))
  209. }
  210. }
  211. }
  212. }
  213. }
  214. }
  215. }
  216. return err
  217. }
  218. func (feed *Feed) Append(e *Entry) (*Entry, error) {
  219. if err := e.Validate(); err != nil {
  220. return nil, err
  221. }
  222. // todo: pre-check uniqueness of Id
  223. feed.Entries = append(feed.Entries, e)
  224. return e, nil
  225. }
  226. // sort.Interface
  227. type ByPublishedDesc []*Entry
  228. func (a ByPublishedDesc) Len() int { return len(a) }
  229. func (a ByPublishedDesc) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  230. func (a ByPublishedDesc) Less(i, j int) bool { return !a[i].Published.Before(a[j].Published) }
  231. type ByUpdatedDesc []*Entry
  232. func (a ByUpdatedDesc) Len() int { return len(a) }
  233. func (a ByUpdatedDesc) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  234. func (a ByUpdatedDesc) Less(i, j int) bool { return !a[i].Updated.Before(a[j].Updated) }
  235. // custom interface
  236. // sufficient for 32 bit.
  237. func base64ToBase24x7(b64 string) (string, error) {
  238. if data, err := base64.RawURLEncoding.DecodeString(b64); err != nil {
  239. return "", err
  240. } else {
  241. // check len(data) ?
  242. ui32 := binary.LittleEndian.Uint32(data)
  243. base24 := fmt.Sprintf("%07s", strconv.FormatUint(uint64(ui32), 24))
  244. return strings.Map(mapBase24ToSuperCareful, base24), nil
  245. }
  246. }
  247. // Being "super-careful" https://code.mro.name/mro/ProgrammableWebSwartz2013/src/master/content/pages/2-building-for-users.md
  248. //
  249. // 0123456789abcdefghijklmn ->
  250. // 23456789abcdefghkrstuxyz
  251. func mapBase24ToSuperCareful(r rune) rune {
  252. digits := []rune("23456789abcdefghkrstuxyz")
  253. switch {
  254. case '0' <= r && r <= '9':
  255. return digits[:10][r-'0']
  256. case r >= 'a' && r <= 'n':
  257. return digits[10:][r-'a']
  258. }
  259. panic("ouch")
  260. }
  261. func newRandomId(t time.Time) Id {
  262. ui32 := uint32(t.Unix() & 0xFFFFFFFF) // unix time in seconds as uint32
  263. base24 := fmt.Sprintf("%07s", strconv.FormatUint(uint64(ui32), 24))
  264. return Id(strings.Map(mapBase24ToSuperCareful, base24))
  265. }
  266. func (feed Feed) newUniqueId(t time.Time) Id {
  267. id := newRandomId(t)
  268. for _, entry := range feed.Entries {
  269. if entry.Id == id {
  270. panic("id not unique")
  271. }
  272. }
  273. return id
  274. }
  275. func (feed Feed) newEntry(t time.Time) *Entry {
  276. defer un(trace("Feed.newEntry(t)"))
  277. return &Entry{
  278. Authors: feed.Authors,
  279. Published: iso8601(t),
  280. Id: feed.newUniqueId(t),
  281. }
  282. }
  283. func (feed *Feed) findEntry(doesMatch func(*Entry) bool) (int, *Entry) {
  284. defer un(trace(strings.Join([]string{"Feed.findEntry(f(*Entry))"}, "")))
  285. if nil != doesMatch {
  286. for idx, entry := range feed.Entries {
  287. if doesMatch(entry) {
  288. return idx, entry
  289. }
  290. }
  291. }
  292. return -1, nil
  293. }
  294. func (feed *Feed) findEntryById(id Id) (int, *Entry) {
  295. defer un(trace(strings.Join([]string{"Feed.findEntryById('", string(id), "')"}, "")))
  296. if "" != id {
  297. return feed.findEntry(func(entry *Entry) bool { return id == entry.Id })
  298. }
  299. return feed.findEntry(nil)
  300. }
  301. func (feed *Feed) deleteEntryById(id Id) *Entry {
  302. if i, entry := feed.findEntryById(id); i < 0 {
  303. return nil
  304. } else {
  305. a := feed.Entries
  306. // https://github.com/golang/go/wiki/SliceTricks
  307. copy(a[i:], a[i+1:])
  308. // a[len(a)-1] = nil // or the zero value of T
  309. feed.Entries = a[:len(a)-1]
  310. feed.Updated = iso8601(time.Now())
  311. // don' try to be smart. When removing days feeds, we rely on correct Published date.
  312. // entry.Published = iso8601{time.Time{}}
  313. // entry.Updated = entry.Published
  314. return entry
  315. }
  316. }
  317. func (feed Feed) SaveToFile(dst string) error {
  318. defer un(trace("Feed.SaveToFile"))
  319. sort.Sort(ByPublishedDesc(feed.Entries))
  320. // remove deleted entries? Maybe Published date zero.
  321. tmp := dst + "~"
  322. var err error
  323. var w *os.File
  324. if w, err = os.Create(tmp); err == nil {
  325. enc := xml.NewEncoder(w)
  326. enc.Indent("", " ")
  327. if err = enc.Encode(feed); err == nil {
  328. if err = enc.Flush(); err == nil {
  329. if err = w.Close(); err == nil {
  330. if err = os.Rename(dst, dst+".bak"); err == nil || os.IsNotExist(err) {
  331. if err = os.Rename(tmp, dst); err == nil {
  332. return nil
  333. }
  334. }
  335. }
  336. }
  337. }
  338. }
  339. return err
  340. }
  341. // Validate for storage
  342. func (entry *Entry) Validate() error {
  343. if "" == entry.Id {
  344. return errors.New("Entry may not have empty Id.")
  345. }
  346. if 1 < len(entry.Links) {
  347. return fmt.Errorf("Entry may not have more than one link. Entry.Id='%s'", entry.Id)
  348. }
  349. if 1 == len(entry.Links) {
  350. if "" == entry.Links[0].Href {
  351. return fmt.Errorf("Entry may not have empty link. Entry.Id='%s'", entry.Id)
  352. }
  353. url := mustParseURL(entry.Links[0].Href)
  354. if !url.IsAbs() {
  355. return fmt.Errorf("Entry must have absolute Link. Entry.Id='%s'", entry.Id)
  356. }
  357. if "" == url.Host {
  358. return fmt.Errorf("Entry must have Link with non-empty host. Entry.Id='%s'", entry.Id)
  359. }
  360. }
  361. return nil
  362. }
  363. func (entry Entry) CategoriesMerged() []Category {
  364. a := entry.Title.Categories()
  365. b := entry.Content.Categories()
  366. ret := make([]Category, 0, len(a)+len(b)+len(entry.Categories))
  367. ret = append(ret, a...)
  368. ret = append(ret, b...)
  369. ret = append(ret, entry.Categories...)
  370. sort.Slice(ret, func(i, j int) bool { return strings.Compare(ret[i].Term, ret[j].Term) < 0 })
  371. return uniqCategory(ret)
  372. }
  373. func AggregateCategories(entries []*Entry) []Category {
  374. // aggregate & count feed entry categories
  375. cats := make(map[string]int, 1*len(entries)) // raw len guess
  376. for _, ent := range entries {
  377. for _, cat := range ent.Categories {
  378. cats[cat.Term] += 1
  379. }
  380. }
  381. cs := make([]Category, 0, len(cats))
  382. for term, count := range cats {
  383. if term != "" && count != 0 {
  384. cs = append(cs, Category{Term: term, Label: strconv.Itoa(count)})
  385. }
  386. }
  387. sort.Slice(cs, func(i, j int) bool {
  388. return strings.Compare(cs[i].Term, cs[j].Term) < 0
  389. })
  390. return cs
  391. }
  392. func uniqCategory(data []Category) []Category {
  393. ret := make([]Category, 0, len(data))
  394. for i, e := range data {
  395. if "" == e.Term {
  396. continue
  397. }
  398. if i == 0 || e.Term != data[i-1].Term {
  399. ret = append(ret, e)
  400. }
  401. }
  402. return ret
  403. }
  404. func (ht HumanText) Categories() []Category {
  405. ret := make([]Category, 0, 10)
  406. for t := range tagsFromString(ht.Body) {
  407. ret = append(ret, Category{Term: t})
  408. }
  409. return ret
  410. }
  411. // https://stackoverflow.com/a/39425959
  412. func isEmojiRune(ru rune) bool {
  413. r := int(ru)
  414. return false ||
  415. (0x2b50 <= r && r <= 0x2b50) || // star
  416. (0x1F600 <= r && r <= 0x1F64F) || // Emoticons
  417. (0x1F300 <= r && r <= 0x1F5FF) || // Misc Symbols and Pictographs
  418. (0x1F680 <= r && r <= 0x1F6FF) || // Transport and Map
  419. (0x1F1E6 <= r && r <= 0x1F1FF) || // Regional country flags
  420. (0x2600 <= r && r <= 0x26FF) || // Misc symbols
  421. (0x2700 <= r && r <= 0x27BF) || // Dingbats
  422. (0xFE00 <= r && r <= 0xFE0F) || // Variation Selectors
  423. (0x1F900 <= r && r <= 0x1F9FF) || // Supplemental Symbols and Pictographs
  424. (0x1f018 <= r && r <= 0x1f270) || // Various asian characters
  425. (0xfe00 <= r && r <= 0xfe0f) || // Variation selector
  426. (0x238c <= r && r <= 0x2454) || // Misc items
  427. (0x20d0 <= r && r <= 0x20ff) // Combining Diacritical Marks for Symbols
  428. }
  429. func isTag(s string) bool {
  430. for _, r := range s {
  431. if '#' == r {
  432. return true
  433. }
  434. return isEmojiRune(r)
  435. }
  436. return false
  437. }
  438. func tagsFromString(str string) map[string]struct{} {
  439. scanner := bufio.NewScanner(strings.NewReader(str))
  440. scanner.Split(bufio.ScanWords)
  441. ret := make(map[string]struct{}, 10)
  442. for scanner.Scan() {
  443. term := scanner.Text()
  444. if !isTag(term) {
  445. continue
  446. }
  447. term = strings.TrimLeft(term, "#")
  448. term = strings.TrimRightFunc(term, func(r rune) bool {
  449. return !('§' == r || '†' == r) && unicode.IsPunct(r)
  450. })
  451. if "" != term {
  452. ret[term] = struct{}{}
  453. }
  454. }
  455. return ret
  456. }
  457. const iWillBeALineFeedMarker = "+,zX@D4X#%`lGdX-vWU?/==v"
  458. func cleanLegacyContent(txt string) string {
  459. src := strings.Replace(txt, "<br />", iWillBeALineFeedMarker, -1)
  460. if node, err := html.Parse(strings.NewReader(src)); err == nil {
  461. str := strings.Replace(scrape.Text(node), iWillBeALineFeedMarker, "", -1)
  462. return strings.Trim(str[:len(str)-len("( Permalink )")], " ")
  463. } else {
  464. return err.Error()
  465. }
  466. }