diff options
author | Matthew Sotoudeh <matthewsot@outlook.com> | 2022-02-07 21:02:19 -0800 |
---|---|---|
committer | Matthew Sotoudeh <matthewsot@outlook.com> | 2022-02-07 21:02:19 -0800 |
commit | fb2865d4144ca80c9ebcb589f4692f27640f2e2f (patch) | |
tree | 781024e7c0d69a790a62025ade83895d5e0f60bf | |
parent | a136b96913f90570e945e62a9c04827b415f0fd7 (diff) |
Rudimentary support for CDATA in the RSS parser
-rw-r--r-- | sources/rss-source.rkt | 25 |
1 files changed, 18 insertions, 7 deletions
diff --git a/sources/rss-source.rkt b/sources/rss-source.rkt index 8f087b1..a2a83fc 100644 --- a/sources/rss-source.rkt +++ b/sources/rss-source.rkt @@ -20,14 +20,25 @@ (struct rss-channel (title description items) #:transparent) +(define (stringifyable xexpr) (or (string? xexpr) (cdata? xexpr))) +(define (stringify xexpr) + (cond + [(string? xexpr) xexpr] + [(cdata? xexpr) (let* ([str (cdata-string xexpr)] + [len (string-length str)] + [prefix-len (string-length "<![CDATA]")] + [suffix-len (string-length "]]>")]) + (substring str prefix-len (- len suffix-len)))] + [else #f])) + (define (parse-rss xexpr) ; Some RSS feeds drop the items directly under the <rdf> (see: arXiv) while ; others wrap them in a <channel>. This picks up both. (cons (parse-channel xexpr) (map parse-channel (find-children 'channel xexpr)))) (define (parse-channel xexpr) - (rss-channel (se-path* '(channel title) xexpr) - (se-path* '(channel description) xexpr) + (rss-channel (stringify (se-path* '(channel title) xexpr)) + (stringify (se-path* '(channel description) xexpr)) (filter (curry assoc "title") ; Youtube uses entry, not item (append (map parse-item (find-children 'entry xexpr)) @@ -40,12 +51,12 @@ (match xexprs ['() '()] [`((title ,attrs ,title) ,rest ...) - #:when (and (string? title) (not (member 'title already-seen))) - `(("title" . ,title) . - ,(parse-out-attrs rest `(title . already-seen)))] + #:when (and (stringifyable title) (not (member 'title already-seen))) + `(("title" . ,(stringify title)) . + ,(parse-out-attrs rest `(title . ,already-seen)))] [`((description ,attrs ,descr) ,rest ...) - #:when (and (string? descr) (not (member 'descr already-seen))) - `(("description" . ,descr) . + #:when (and (stringifyable descr) (not (member 'descr already-seen))) + `(("description" . ,(stringify descr)) . ,(parse-out-attrs rest `(descr . ,already-seen)))] [`((link ,attrs ,url) ,rest ...) #:when (and (string? url) (not (member 'url already-seen))) |