diff options
author | Ted Unangst <tedu@tedunangst.com> | 2022-03-26 01:46:55 -0400 |
---|---|---|
committer | Ted Unangst <tedu@tedunangst.com> | 2022-03-26 01:46:55 -0400 |
commit | a5667801a3282cc7ca04eea670986a75b824bef2 (patch) | |
tree | dd6c6f36fc18263c7dc9d4275f848149b68e8dfc | |
parent | 8460b97bff37c577973d5c3f8e023746a67fc0f8 (diff) |
save this so we don't lose it
-rw-r--r-- | scripts/script.go | 321 |
1 files changed, 321 insertions, 0 deletions
diff --git a/scripts/script.go b/scripts/script.go new file mode 100644 index 0000000..8eb69f0 --- /dev/null +++ b/scripts/script.go @@ -0,0 +1,321 @@ +package filt + +func Prefilter(req *http.Request) *http.Response { +} + +var nofilter = map[string]bool{ + "news.ycombinator.com": true, +} + +func FilterHTML(w io.Writer, req *http.Request, root *html.Node) bool { + if req.URL.Path == "/" || nofilter[req.URL.Host] { + return false + } + if req.URL.Host == "www.washingtonpost.com" { + findwapoblob(root) + } + w.Write(prolog) + sel := cascadia.MustCompile("article") + articles := sel.MatchAll(root) + if len(articles) > 0 { + w.Write(prolog) + for _, a := range articles { + clean(w, a, req.URL) + } + return true + } + clean(w, root, req.URL) + return true +} + +var startingblob = 0 +var blobarray []interface{} + +func findwapoblob(root *html.Node) { + script := cascadia.MustCompile("script#__NEXT_DATA__").MatchFirst(root) + if script == nil { + return + } + if c := script.FirstChild; c != nil && c.Type == html.TextNode { + j := strings.NewReader(c.Data) + decoder := json.NewDecoder(j) + var jmap map[string]interface{} + err := decoder.Decode(&jmap) + if err != nil { + return + } + //encoder := json.NewEncoder(os.Stdout) + //encoder.SetIndent("", " ") + //encoder.Encode(jmap) + contents := jsonfind(jmap, []string{"props", "pageProps", "globalContent", "content_elements"}) + if contents != nil { + blobarray = contents.([]interface{}) + } + } +} + +var prolog = []byte(` +<!doctype html> +<html> +<head> +<meta charset="utf-8"> +<style> +body { + background: #111; + color: #fed; + max-width: 940px; + margin: auto; + padding: 1em; + font-size: 1.4em; + line-height: 1.5; +} +img { + max-width: 100%; +} +a { + color: #cef; +} +pre { + white-space: pre-wrap; + word-wrap: break-word; +} +</style> +`) + +var permittedtags = []string{"title", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6", + "table", "thead", "tbody", "th", "tr", "td", + "p", "br", "pre", "code", "blockquote", + "strong", "em", "b", "i", "u", "s", "sup", "del", + "ol", "ul", "li"} +var permittedattr = []string{"colspan", "rowspan", "id"} +var bannedtags = []string{"script", "style"} + +func init() { + sort.Strings(permittedtags) + sort.Strings(permittedattr) + sort.Strings(bannedtags) +} + +func contains(array []string, tag string) bool { + idx := sort.SearchStrings(array, tag) + return idx < len(array) && array[idx] == tag +} + +func getattr(node *html.Node, attr string) string { + for _, a := range node.Attr { + if a.Key == attr { + return a.Val + } + } + return "" +} + +func hasclass(node *html.Node, class string) bool { + classes := getattr(node, class) + for _, c := range strings.Split(classes, " ") { + if c == class { + return true + } + } + return false +} + +func writetag(w io.Writer, node *html.Node) { + io.WriteString(w, "<") + io.WriteString(w, node.Data) + for _, attr := range node.Attr { + if contains(permittedattr, attr.Key) { + fmt.Fprintf(w, ` %s="%s"`, attr.Key, html.EscapeString(attr.Val)) + } + } + io.WriteString(w, ">") +} + +func jsonfind(jmap map[string]interface{}, keys []string) interface{} { + var ii interface{} + ii = jmap + for _, key := range keys { + idx, err := strconv.Atoi(key) + if err == nil { + m := ii.([]interface{}) + ii = m[idx] + } else { + m := ii.(map[string]interface{}) + ii = m[key] + } + if ii == nil { + return nil + } + } + return ii +} + +func extractblogshit(script string, baseurl *url.URL) *html.Node { + re := regexp.MustCompile("blogspot.*'postId': '([0-9]+)'") + m := re.FindStringSubmatch(script) + if m == nil { + return nil + } + targ := fmt.Sprintf("%s://%s/feeds/posts/default/%s?alt=json", baseurl.Scheme, baseurl.Hostname(), m[1]) + log.Printf("now i'm getting real content from %s", targ) + jr, err := http.Get(targ) + if err != nil { + log.Println("json post err", err) + return nil + } + defer jr.Body.Close() + decoder := json.NewDecoder(jr.Body) + var jmap map[string]interface{} + err = decoder.Decode(&jmap) + if err != nil { + log.Println("json decode error") + return nil + } + body := jsonfind(jmap, []string{"entry", "content", "$t"}).(string) + root, _ := html.Parse(strings.NewReader(body)) + return root +} + +func procscriptnode(w io.Writer, script *html.Node, baseurl *url.URL) { + src := getattr(script, "src") + if src != "" { + scripturl, err := url.Parse(src) + if err != nil { + log.Println("frame err", err) + return + } + if scripturl.Host == "gist.github.com" { + path := scripturl.Path + if strings.HasSuffix(path, ".js") { + path = path[:len(path)-3] + "/raw" + } + scripturl.Path = path + txt, err := http.Get(scripturl.String()) + if err != nil { + log.Println("frame err", err) + return + } + defer txt.Body.Close() + var buf bytes.Buffer + io.Copy(&buf, txt.Body) + io.WriteString(w, `<pre style="border: 2px solid black; padding: 1em;">`) + html_template.HTMLEscape(w, buf.Bytes()) + io.WriteString(w, `</pre>`) + } + } else { + if c := script.FirstChild; c != nil && c.Type == html.TextNode { + article := extractblogshit(c.Data, baseurl) + if article != nil { + clean(w, article, baseurl) + } + } + } +} + +func clean(w io.Writer, node *html.Node, baseurl *url.URL) { + switch node.Type { + case html.ElementNode: + tag := node.Data + switch { + case tag == "a": + fmt.Fprintf(w, `<a href="%s" id="%s">`, + html.EscapeString(getattr(node, "href")), + html.EscapeString(getattr(node, "id"))) + case tag == "img": + src := getattr(node, "src") + if getattr(node, "data-native-src") != "" { + src = getattr(node, "data-native-src") + if strings.HasSuffix(src, "-1x-1.jpg") { + src = src[:len(src)-9] + "1200x-1.jpg" + } + } else if getattr(node, "data-raw-src") != "" { + src = getattr(node, "data-raw-src") + } else if getattr(node, "data-runner-src") != "" { + src = getattr(node, "data-runner-src") + } + alt := getattr(node, "alt") + + if strings.Contains(getattr(node, "class"), "progressiveMedia-thumbnail") { + src = "" + } + if src != "" { + fmt.Fprintf(w, `<img src="%s" alt="%s">`, html.EscapeString(src), html.EscapeString(alt)) + } + case tag == "picture": + src := "" + for c := node.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.ElementNode && c.Data == "source" { + attr := getattr(c, "data-srcset") + if attr != "" { + src = attr + } + } + } + if src != "" { + fmt.Fprintf(w, `<picture><img src="%s"></picture>`, + html.EscapeString(src)) + return + } + case tag == "figure": + case tag == "iframe": + case tag == "noscript": + // see https://github.com/golang/go/issues/16318 + if c := node.FirstChild; c != nil { + rereader := strings.NewReader(c.Data) + root, err := html.Parse(rereader) + if err == nil { + clean(w, root, baseurl) + } + } + return + case tag == "script": + procscriptnode(w, node, baseurl) + return + case tag == "link": + rel := getattr(node, "rel") + if rel == "alternate" || rel == "icon" { + html.Render(w, node) + } + case contains(permittedtags, tag): + if hasclass(node, "hide-for-print") { + return + } + if qa := getattr(node, "data-qa"); qa != "" { + if qa == "article-body-ad" { + return + } + if qa == "article-image" { + for i := startingblob; i < len(blobarray); i++ { + m := blobarray[i].(map[string]interface{}) + if m["type"].(string) == "image" { + src := m["url"].(string) + src = fmt.Sprintf("https://www.washingtonpost.com/wp-apps/imrs.php?src=%s&w=916", src) + alt := m["credits_caption_display"].(string) + fmt.Fprintf(w, `<img src="%s"><p>%s`, html.EscapeString(src), html.EscapeString(alt)) + startingblob = i+1 + return + } + } + } + } + writetag(w, node) + case contains(bannedtags, tag): + return + } + case html.TextNode: + io.WriteString(w, html.EscapeString(node.Data)) + } + for c := node.FirstChild; c != nil; c = c.NextSibling { + clean(w, c, baseurl) + } + if node.Type == html.ElementNode { + tag := node.Data + if tag == "a" || (contains(permittedtags, tag) && tag != "br") { + fmt.Fprintf(w, "</%s>", tag) + } + if tag == "p" || tag == "div" { + io.WriteString(w, "\n") + } + } +} + |