feedwriter.go
//
// Copyright (C) 2017-2018 Marcus Rohrmoser, http://purl.mro.name/ShaarliGo
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
package main

import (
	"compress/gzip"
	"encoding/json"
	"encoding/xml"
	"errors"
	"fmt"
	"io/ioutil"
	"log"
	"net/url"
	"os"
	"path"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)
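
// mustParseURL parses u into a *url.URL and panics on failure; meant for
// URLs that are known to be well-formed.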
func mustParseURL(u string) *url.URL {
	if ret, err := url.Parse(u); err != nil {
		panic("Cannot parse URL '" + u + "' " + err.Error())
	} else {
		return ret
	}
}

const cgiName = "shaarligo.cgi"
const dirAssets = "assets"
const dirApp = "app"

const uriPub = "o"
const uriPosts = "p"
const uriDays = "d"
const uriTags = "t"

const relSelf = "self"            // https://www.iana.org/assignments/link-relations/link-relations.xhtml
const relAlternate = "alternate"  // https://www.iana.org/assignments/link-relations/link-relations.xhtml
const relVia = "via"              // Atom https://tools.ietf.org/html/rfc4287
const relEnclosure = "enclosure"  // Atom https://tools.ietf.org/html/rfc4287
const relFirst = "first"          // paged feeds https://tools.ietf.org/html/rfc5005#section-3
const relLast = "last"            // paged feeds https://tools.ietf.org/html/rfc5005#section-3
const relNext = "next"            // paged feeds https://tools.ietf.org/html/rfc5005#section-3
const relPrevious = "previous"    // paged feeds https://tools.ietf.org/html/rfc5005#section-3
const relEdit = "edit"            // AtomPub https://tools.ietf.org/html/rfc5023
const relEditMedia = "edit-media" // AtomPub https://tools.ietf.org/html/rfc5023
const relUp = "up"                // https://www.iana.org/assignments/link-relations/link-relations.xhtml
const relSearch = "search"        // http://www.opensearch.org/Specifications/OpenSearch/1.1#Autodiscovery_in_RSS.2FAtom

const newDirPerms = 0775

var rexPath = regexp.MustCompile("[^/]+")

const uriPubPosts = uriPub + "/" + uriPosts + "/"
const uriPubTags = uriPub + "/" + uriTags + "/"
const uriPubDays = uriPub + "/" + uriDays + "/"
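
// uri2subtitle derives a feed subtitle from the feed URI: "#tag" for tag
// feeds, "📅 2006-01-02"-style dates for day feeds, the given default
// otherwise.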
func uri2subtitle(subtitle *HumanText, uri string) *HumanText {
	if strings.HasPrefix(uri, uriPubTags) {
		return &HumanText{Body: "#" + strings.TrimRight(uri[len(uriPubTags):], "/")}
	}
	if strings.HasPrefix(uri, uriPubDays) {
		return &HumanText{Body: "📅 " + strings.TrimRight(uri[len(uriPubDays):], "/")}
	}
	return subtitle
}
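
// FeedFilters adds to uri2filter, for every feed this entry belongs to (all
// posts, the entry's own single-post feed, one feed per tag, one per
// publication day, plus an always-empty tag index), a predicate deciding
// which entries belong into that feed. Keys are feed URIs such as "o/p/" or
// "o/t/{term}/".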
func (entry Entry) FeedFilters(uri2filter map[string]func(*Entry) bool) map[string]func(*Entry) bool {
	// defer un(trace("Entry.FeedFilters " + entry.Id))
	if nil == uri2filter {
		uri2filter = make(map[string]func(*Entry) bool, 10)
	}
	uri2filter[uriPubPosts] = func(*Entry) bool { return true }
	uri2filter[uriPubPosts+entry.Id+"/"] = func(iEntry *Entry) bool { return entry.Id == iEntry.Id }

	uri2filter[uriPubTags] = func(*Entry) bool { return false } // dummy to get an (empty) feed
	for _, cat := range entry.Categories {
		trm := cat.Term // capture the term, not the loop variable, for the closure
		uri2filter[uriPubTags+trm+"/"] = func(iEntry *Entry) bool {
			for _, iCat := range iEntry.Categories {
				if trm == iCat.Term { // && cat.Scheme == iCat.Scheme {
					return true
				}
			}
			return false
		}
	}

	// uri2filter["pub/days/", func(*Entry) bool { return false })
	dayStr := entry.Published.Format(time.RFC3339[:10])
	uri2filter[uriPubDays+dayStr+"/"] = func(iEntry *Entry) bool {
		return dayStr == iEntry.Published.Format(time.RFC3339[:10])
	}
	return uri2filter
}
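
// LinkRel returns the first of links whose rel attribute (possibly a
// space-separated list) contains rel, or a zero Link if none matches.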
func LinkRel(rel string, links []Link) Link {
	for _, l := range links {
		for _, r := range strings.Fields(l.Rel) { // may be worth caching
			if rel == r {
				return l
			}
		}
	}
	return Link{}
}

func LinkRelSelf(links []Link) Link {
	return LinkRel(relSelf, links)
}
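
// uriSliceSorted returns the keys of uri2filter in sorted order, so callers
// can iterate the map deterministically.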
func uriSliceSorted(uri2filter map[string]func(*Entry) bool) []string {
	keys := make([]string, len(uri2filter))
	{
		i := 0
		for k := range uri2filter {
			keys[i] = k
			i++
		}
	}
	sort.Strings(keys) // I don't care too much how they're sorted, I just want them to be stable.
	return keys
}

// CompleteFeeds collects all entries into all (unpaged, complete) feeds to
// publish, one clone of seed per entry of uri2filter.
//
// Returns the feeds sorted by Id.
func (seed Feed) CompleteFeeds(uri2filter map[string]func(*Entry) bool) []Feed {
	defer un(trace("Feed.CompleteFeeds"))
	ret := make([]Feed, 0, len(uri2filter))
	for _, uri := range uriSliceSorted(uri2filter) {
		entryFilter := uri2filter[uri]
		feed := seed // clone
		feed.Id = uri
		feed.Subtitle = uri2subtitle(feed.Subtitle, uri)
		feed.Entries = nil // save reallocs?
		for _, entry := range seed.Entries {
			if entryFilter(entry) {
				feed.Entries = append(feed.Entries, entry)
			}
		}
		if uriPubTags == uri {
			feed.Categories = AggregateCategories(seed.Entries) // rather the ones from o/p
		}
		ret = append(ret, feed)
	}
	return ret
}
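
// appendPageNumber turns a feed URI into the URI of one of its pages. The
// newest page (page == pageCount-1) keeps the bare URI, all older ones get
// the page number appended, e.g. appendPageNumber("o/p/", 0, 3) == "o/p-0/".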
func appendPageNumber(prefix string, page, pageCount int) string {
	if !strings.HasSuffix(prefix, "/") {
		panic("invalid input: appendPageNumber('" + prefix + "', " + strconv.Itoa(page) + ") needs a trailing slash")
	}
	if page == pageCount-1 {
		return prefix
	}
	return fmt.Sprintf("%s-%d/", prefix[:len(prefix)-1], page)
}
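
// computePageCount is a ceiling division where even an empty feed gets one
// page: computePageCount(0, 10) == 1, computePageCount(10, 10) == 1,
// computePageCount(11, 10) == 2.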
func computePageCount(count int, entriesPerPage int) int {
	if count == 0 {
		// even 0 entries need one (empty) page
		return 1
	}
	return 1 + (count-1)/entriesPerPage
}
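
// Pages splits a complete feed into pages of at most entriesPerPage entries.
// Entries come sorted newest-first, and pages are filled starting from the
// oldest entry: page 0 is the oldest, the newest page lives at the bare feed
// URI, and only the page right before the newest may hold fewer than
// entriesPerPage entries. That way older, already-written pages tend to keep
// their content as new entries arrive. Each page carries the RFC 5005
// first/last/next/previous links.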
func (seed Feed) Pages(entriesPerPage int) []Feed {
	// defer un(trace("Feed.Pages " + seed.Id))
	entriesPerPage = max(1, entriesPerPage)
	totalEntries := len(seed.Entries)
	pageCount := computePageCount(totalEntries, entriesPerPage)
	ret := make([]Feed, 0, pageCount)
	uri := seed.Id
	link := func(rel string, page int) Link {
		return Link{Rel: rel, Href: appendPageNumber(uri, page, pageCount), Title: strconv.Itoa(page + 1)}
	}
	lower := totalEntries // start past the oldest entry
	for page := 0; page < pageCount; page++ {
		feed := seed
		{
			upper := lower
			step := entriesPerPage
			if page == pageCount-2 {
				// only the page BEFORE the last one has variable length (if needed)
				step = totalEntries % entriesPerPage
				if 0 == step {
					step = entriesPerPage
				}
			}
			lower = max(0, upper-step)
			feed.Entries = seed.Entries[lower:upper]
			feed.Updated = iso8601(time.Time{}) // start with zero
			for _, ent := range feed.Entries {  // max of entries
				if feed.Updated.Before(ent.Updated) {
					feed.Updated = ent.Updated
				}
			}
		}
		ls := append(make([]Link, 0, len(feed.Links)+5), feed.Links...)
		ls = append(ls, link(relSelf, page))
		// https://tools.ietf.org/html/rfc5005#section-3
		if pageCount > 1 {
			ls = append(ls, link(relLast, 0)) // oldest, i.e. lowest page number
			if page > 0 {
				ls = append(ls, link(relNext, page-1)) // older, i.e. smaller page number
			}
			if page < pageCount-1 {
				ls = append(ls, link(relPrevious, page+1)) // newer, i.e. higher page number
			}
			ls = append(ls, link(relFirst, pageCount-1)) // newest, i.e. largest page number
		} else {
			// TODO https://tools.ietf.org/html/rfc5005#section-2
			// xmlns:fh="http://purl.org/syndication/history/1.0" <fh:complete/>
		}
		feed.Links = ls
		ret = append(ret, feed)
	}
	return ret
}
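
// CompleteFeedsForModifiedEntries collects the complete (unpaged) feeds
// touched by the modified entries. A zero feed.Updated falls back to the
// newest entry's Updated (or Published) timestamp, else to the current time.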
func (feed Feed) CompleteFeedsForModifiedEntries(entries []*Entry) []Feed {
	// defer un(trace("Feed.CompleteFeedsForModifiedEntries"))
	var uri2filter map[string]func(*Entry) bool
	for _, entry := range entries {
		uri2filter = entry.FeedFilters(uri2filter)
	}

	if feed.Updated.IsZero() {
		feed.Updated = func() iso8601 {
			if len(feed.Entries) > 0 {
				ent := feed.Entries[0]
				if !ent.Updated.IsZero() {
					return ent.Updated
				}
				if !ent.Published.IsZero() {
					return ent.Published
				}
			}
			return iso8601(time.Now())
		}()
	}

	return feed.CompleteFeeds(uri2filter)
}
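
// PagedFeeds expands the complete feeds into pages of linksPerPage entries
// and prepares each entry for output (absolute Id, self/edit/up links,
// category scheme) without persisting those changes. feed.XmlBase must be an
// absolute URL with a trailing slash.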
func (feed Feed) PagedFeeds(complete []Feed, linksPerPage int) ([]Feed, error) {
	defer un(trace("Feed.PagedFeeds"))
	xmlBase := mustParseURL(feed.XmlBase)
	if !xmlBase.IsAbs() || !strings.HasSuffix(xmlBase.Path, "/") {
		log.Printf("xml:base is '%s'\n", xmlBase)
		return []Feed{}, errors.New("feed/@xml:base must be set to an absolute URL with a trailing slash")
	}

	pages := make([]Feed, 0, 2*len(complete))
	for _, comp := range complete {
		pages = append(pages, comp.Pages(linksPerPage)...)
	}

	// do before writing but after all matching is done:
	catScheme := xmlBase.ResolveReference(mustParseURL(path.Join(uriPub, uriTags))).String() + "/"
	for _, entry := range feed.Entries {
		entry.XmlBase = xmlBase.String()
		if entry.Updated.IsZero() {
			entry.Updated = entry.Published
		}
		// change entries for output but don't save the change:
		upURL := mustParseURL(path.Join(uriPub, uriPosts) + "/")
		selfURL := mustParseURL(path.Join(uriPub, uriPosts, entry.Id) + "/")
		editURL := strings.Join([]string{cgiName, "?post=", selfURL.String()}, "")
		entry.Id = xmlBase.ResolveReference(selfURL).String() // expand XmlBase as required by https://validator.w3.org/feed/check.cgi?url=
		entry.Links = append(entry.Links,
			Link{Rel: relSelf, Href: selfURL.String()},
			Link{Rel: relEdit, Href: editURL},
			// Link{Rel: relEditMedia, Href: editURL},
			Link{Rel: relUp, Href: upURL.String(), Title: feed.Title.Body}, // we need the feed-name somewhere.
		)
		for i := range entry.Categories {
			entry.Categories[i].Scheme = catScheme
		}
	}
	return pages, nil
}
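
// PublishFeedsForModifiedEntries recomputes and writes every feed page
// affected by the modified entries and then makes sure each entry's own
// index.xml.gz exists and is up to date.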
func (app App) PublishFeedsForModifiedEntries(feed Feed, entries []*Entry) error {
	defer un(trace("App.PublishFeedsForModifiedEntries"))
	feed.Generator = &Generator{Uri: myselfNamespace, Version: version + "+" + GitSHA1, Body: "ShaarliGo"}
	sort.Sort(ByPublishedDesc(feed.Entries))
	// entries = feed.Entries // force write all entries. Every single one.
	complete := feed.CompleteFeedsForModifiedEntries(entries)
	if pages, err := feed.PagedFeeds(complete, app.cfg.LinksPerPage); err == nil {
		if err = app.PublishFeeds(pages, true); err != nil {
			return err
		} else {
			// just assure ALL entries index.xml.gz exist and are up to date
			for _, ent := range feed.Entries {
				if err = app.PublishEntry(ent, false); err != nil { // only if newer
					return err
				}
			}
			return nil
		}
	} else {
		return err
	}
}

// PublishFeeds creates a lock file to avoid races and then calls PublishFeed
// in a loop. A stale lock names the previous writer's pid, which gets killed.
func (app App) PublishFeeds(feeds []Feed, force bool) error {
	defer un(trace("App.PublishFeeds"))
	strFileLock := filepath.Join(dirApp, "var", "lock")

	// check race: if .lock exists kill pid?
	if byPid, err := ioutil.ReadFile(strFileLock); err == nil {
		if pid, errAtoi := strconv.Atoi(string(byPid)); errAtoi == nil {
			if proc, errFind := os.FindProcess(pid); errFind == nil {
				err = proc.Kill() // assign to the outer err: a failed kill must abort below
			}
		}
		if err != nil {
			return err
		}
		if err = os.Remove(strFileLock); err != nil {
			return err
		}
	}

	// create .lock file with pid
	if err := ioutil.WriteFile(strFileLock, []byte(strconv.Itoa(os.Getpid())), os.ModeExclusive); err == nil {
		defer os.Remove(strFileLock)
		for _, feed := range feeds {
			if err := app.PublishFeed(feed, force); err != nil {
				return err
			}
			if uriPubTags == LinkRelSelf(feed.Links).Href {
				// write additional index.json with all (public) category terms
				const jsonFileName = "index.json"
				tags := make([]string, 0, len(feed.Categories))
				for _, cat := range feed.Categories {
					tags = append(tags, "#"+cat.Term)
				}
				dstDirName := filepath.FromSlash(uriPubTags)
				dstFileName := filepath.Join(dstDirName, jsonFileName)
				tmpFileName := dstFileName + "~"
				var w *os.File
				if w, err = os.Create(tmpFileName); err == nil {
					defer w.Close() // just to be sure
					enc := json.NewEncoder(w)
					if err = enc.Encode(tags); err == nil {
						if err = w.Close(); err == nil {
							if err := os.Rename(tmpFileName, dstFileName); err != nil {
								return err
							}
						}
					}
				}
			}
		}
	}
	return nil
}
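
// PublishFeed writes one feed page as gzipped Atom (index.xml.gz) into the
// directory named by the feed's self link, atomically via a "~" temp file.
// Unless force is set, a file that is not older than feed.Updated is left
// alone, and an empty three-segment feed (single post, tag or day) is
// removed rather than written.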
func (app App) PublishFeed(feed Feed, force bool) error {
	const feedFileName = "index.xml.gz"
	const xsltFileName = "posts.xslt"

	uri := LinkRelSelf(feed.Links).Href
	ti, to := trace(strings.Join([]string{"App.PublishFeed", uri}, " "))
	pathPrefix := rexPath.ReplaceAllString(uri, "..")
	dstDirName := filepath.FromSlash(uri)
	dstFileName := filepath.Join(dstDirName, feedFileName)

	remove := ((1 == len(feed.Entries) && feed.Entries[0].Published.IsZero()) ||
		0 == len(feed.Entries)) &&
		"../../../" == pathPrefix
	if remove {
		log.Printf("remove %s", dstFileName)
		err := os.Remove(dstFileName)
		os.Remove(dstDirName) // best effort, fails for non-empty directories
		defer un(ti, to)
		return err
	}

	feed.Id = feed.XmlBase + feed.Id

	mTime := time.Time(feed.Updated)
	var feedOrEntry interface{} = feed
	if "../../../" == pathPrefix && strings.HasPrefix(uri, uriPubPosts) {
		if 0 == len(feed.Entries) {
			return fmt.Errorf("Invalid feed, self: %v len(entries): %d", uri, len(feed.Entries))
		}
		if 1 < len(feed.Entries) {
			log.Printf("%d entries with Id: %v, keeping just one.", len(feed.Entries), uri)
		}
		// a single-post feed is serialised as a bare entry
		ent := feed.Entries[0]
		feedOrEntry = ent
		mTime = time.Time(ent.Updated)
	}

	if fi, err := os.Stat(dstFileName); !force && (fi != nil && !fi.ModTime().Before(mTime)) && !os.IsNotExist(err) {
		// log.Printf("skip %s, still up to date.", dstFileName)
		return err
	}
	defer un(ti, to)

	tmpFileName := dstFileName + "~"
	xslt := path.Join(pathPrefix, dirAssets, app.cfg.Skin, xsltFileName)
	var err error
	if err = os.MkdirAll(dstDirName, newDirPerms); err == nil {
		var gz *os.File
		if gz, err = os.Create(tmpFileName); err == nil {
			defer gz.Close() // just to be sure
			var w *gzip.Writer
			if w, err = gzip.NewWriterLevel(gz, gzip.BestCompression); err == nil {
				defer w.Close() // just to be sure
				enc := xml.NewEncoder(w)
				enc.Indent("", " ")
				if err = xmlEncodeWithXslt(feedOrEntry, xslt, enc); err == nil {
					if err = enc.Flush(); err == nil {
						if err = w.Close(); err == nil {
							os.Chtimes(tmpFileName, mTime, mTime)
							return os.Rename(tmpFileName, dstFileName)
						}
					}
				}
			}
		}
	}
	return err
}
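
// PublishEntry is the single-entry counterpart of PublishFeed: it writes one
// entry as gzipped Atom into the directory named by the entry's self link,
// again skipping the write if the file on disk is newer, unless force is set.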
func (app App) PublishEntry(ent *Entry, force bool) error {
	const feedFileName = "index.xml.gz"
	const xsltFileName = "posts.xslt"

	uri := LinkRelSelf(ent.Links).Href
	ti, to := trace(strings.Join([]string{"App.PublishEntry", uri}, " "))
	pathPrefix := rexPath.ReplaceAllString(uri, "..")
	dstDirName := filepath.FromSlash(uri)
	dstFileName := filepath.Join(dstDirName, feedFileName)

	var feedOrEntry interface{} = ent
	ent.Id = ent.XmlBase + ent.Id
	mTime := time.Time(ent.Updated)

	if fi, err := os.Stat(dstFileName); !force && (fi != nil && !fi.ModTime().Before(mTime)) && !os.IsNotExist(err) {
		// log.Printf("skip %s, still up to date.", dstFileName)
		return err
	}
	defer un(ti, to)

	tmpFileName := dstFileName + "~"
	xslt := path.Join(pathPrefix, dirAssets, app.cfg.Skin, xsltFileName)
	var err error
	if err = os.MkdirAll(dstDirName, newDirPerms); err == nil {
		var gz *os.File
		if gz, err = os.Create(tmpFileName); err == nil {
			defer gz.Close() // just to be sure
			var w *gzip.Writer
			if w, err = gzip.NewWriterLevel(gz, gzip.BestCompression); err == nil {
				defer w.Close() // just to be sure
				enc := xml.NewEncoder(w)
				enc.Indent("", " ")
				if err = xmlEncodeWithXslt(feedOrEntry, xslt, enc); err == nil {
					if err = enc.Flush(); err == nil {
						if err = w.Close(); err == nil {
							os.Chtimes(tmpFileName, mTime, mTime)
							return os.Rename(tmpFileName, dstFileName)
						}
					}
				}
			}
		}
	}
	return err
}