save this so we don't lose it

author: Ted Unangst <tedu@tedunangst.com> 2022-03-26 01:46:55 -0400
committer: Ted Unangst <tedu@tedunangst.com> 2022-03-26 01:46:55 -0400
commit: a5667801a3282cc7ca04eea670986a75b824bef2 (patch)
tree: dd6c6f36fc18263c7dc9d4275f848149b68e8dfc
parent: 8460b97bff37c577973d5c3f8e023746a67fc0f8 (diff)
1 files changed, 321 insertions, 0 deletions
diff --git a/scripts/script.go b/scripts/script.go
new file mode 100644
index 0000000..8eb69f0
--- /dev/null
+++ b/scripts/script.go
@@ -0,0 +1,321 @@
+package filt
+
+// Prefilter receives the incoming request and may return a response for it.
+// It is currently a stub that always returns nil.
+// NOTE(review): nil presumably tells the caller to carry on with its normal
+// handling of the request -- confirm against the code that calls this.
+func Prefilter(req *http.Request) *http.Response {
+	return nil
+}
+
+// nofilter lists hosts that FilterHTML leaves alone: when the request host is
+// a key here, FilterHTML returns false without writing anything.
+var nofilter = map[string]bool{
+	"news.ycombinator.com": true,
+}
+
+// FilterHTML writes a stripped-down rendering of the parsed page root to w.
+//
+// It returns false, writing nothing, when the request path is "/" or the
+// request host is listed in nofilter. Otherwise it writes prolog followed by
+// the cleaned content and returns true. When the page contains <article>
+// elements only those are rendered; when it has none, the whole document is.
+// NOTE(review): false presumably means "serve the page unmodified" -- confirm
+// with the caller.
+func FilterHTML(w io.Writer, req *http.Request, root *html.Node) bool {
+	if req.URL.Path == "/" || nofilter[req.URL.Host] {
+		return false
+	}
+	if req.URL.Host == "www.washingtonpost.com" {
+		// Stash the page's embedded JSON so clean can recover article images.
+		findwapoblob(root)
+	}
+	// Emit the prolog (doctype, head, stylesheet) exactly once, ahead of any
+	// content, whichever rendering path is taken below.
+	w.Write(prolog)
+	articles := cascadia.MustCompile("article").MatchAll(root)
+	if len(articles) == 0 {
+		clean(w, root, req.URL)
+		return true
+	}
+	for _, a := range articles {
+		clean(w, a, req.URL)
+	}
+	return true
+}
+
+// startingblob is the index in blobarray at which clean starts looking for
+// the next image entry; clean moves it past each entry it uses.
+var startingblob = 0
+
+// blobarray holds the content_elements list that findwapoblob extracts from
+// a page's embedded JSON. It is package-level state read by clean.
+var blobarray []interface{}
+
+// findwapoblob looks for the script#__NEXT_DATA__ element under root, decodes
+// its text as JSON and stores the array found at
+// props.pageProps.globalContent.content_elements in blobarray for clean to
+// consume.
+//
+// blobarray and startingblob are reset first, so entries and the read cursor
+// left over from a previously filtered page never carry over. When the
+// script is absent, its text does not decode, or the value at that path is
+// missing or not an array, blobarray stays empty.
+func findwapoblob(root *html.Node) {
+	blobarray = nil
+	startingblob = 0
+
+	script := cascadia.MustCompile("script#__NEXT_DATA__").MatchFirst(root)
+	if script == nil {
+		return
+	}
+	c := script.FirstChild
+	if c == nil || c.Type != html.TextNode {
+		return
+	}
+	var jmap map[string]interface{}
+	if err := json.NewDecoder(strings.NewReader(c.Data)).Decode(&jmap); err != nil {
+		return
+	}
+	//encoder := json.NewEncoder(os.Stdout)
+	//encoder.SetIndent("", "  ")
+	//encoder.Encode(jmap)
+	contents := jsonfind(jmap, []string{"props", "pageProps", "globalContent", "content_elements"})
+	// Checked assertion: a nil or non-array value must not panic.
+	if elems, ok := contents.([]interface{}); ok {
+		blobarray = elems
+	}
+}
+
+// prolog is the fixed page header FilterHTML writes ahead of the cleaned
+// content: doctype, <html>, <head>, a utf-8 charset declaration and an inline
+// stylesheet (dark background, light text, width-limited body). The literal
+// does not close <head> or open <body>.
+var prolog = []byte(`
+<!doctype html>
+<html>
+<head>
+<meta charset="utf-8">
+<style>
+body {
+	background: #111;
+	color: #fed;
+	max-width: 940px;
+	margin: auto;
+	padding: 1em;
+	font-size: 1.4em;
+	line-height: 1.5;
+}
+img {
+	max-width: 100%;
+}
+a {
+	color: #cef;
+}
+pre {
+	white-space: pre-wrap;
+	word-wrap: break-word;
+}
+</style>
+`)
+
+// permittedtags are the elements clean normally passes through: the opening
+// tag is written by writetag and a closing tag follows for all but br.
+// The list is sorted by init because contains binary-searches it.
+var permittedtags = []string{"title", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6",
+	"table", "thead", "tbody", "th", "tr", "td",
+	"p", "br", "pre", "code", "blockquote",
+	"strong", "em", "b", "i", "u", "s", "sup", "del",
+	"ol", "ul", "li"}
+
+// permittedattr are the only attributes writetag copies onto an emitted tag.
+// Sorted by init for contains.
+var permittedattr = []string{"colspan", "rowspan", "id"}
+
+// bannedtags are elements clean skips entirely, children included. Note that
+// clean intercepts "script" in its own case before this list is consulted.
+// Sorted by init for contains.
+var bannedtags = []string{"script", "style"}
+
+// init sorts the tag and attribute lists in place so that contains can
+// binary-search them.
+func init() {
+	for _, list := range [][]string{permittedtags, permittedattr, bannedtags} {
+		sort.Strings(list)
+	}
+}
+
+// contains reports whether tag is present in array. The array must already
+// be sorted in ascending order, because it is searched with
+// sort.SearchStrings.
+func contains(array []string, tag string) bool {
+	pos := sort.SearchStrings(array, tag)
+	if pos >= len(array) {
+		return false
+	}
+	return array[pos] == tag
+}
+
+// getattr returns the value of the first attribute on node whose key equals
+// attr, or "" when node has no such attribute.
+func getattr(node *html.Node, attr string) string {
+	for i := range node.Attr {
+		if node.Attr[i].Key != attr {
+			continue
+		}
+		return node.Attr[i].Val
+	}
+	return ""
+}
+
+// hasclass reports whether class is one of the whitespace-separated names in
+// node's "class" attribute.
+func hasclass(node *html.Node, class string) bool {
+	// Read the attribute literally called "class"; the name being searched
+	// for is a value inside that attribute, not an attribute key.
+	// strings.Fields also copes with tabs, newlines and repeated spaces.
+	for _, c := range strings.Fields(getattr(node, "class")) {
+		if c == class {
+			return true
+		}
+	}
+	return false
+}
+
+// writetag emits the opening tag for node to w: "<", the element name, then
+// each attribute whose key is listed in permittedattr with its value
+// HTML-escaped, then ">". All other attributes are left out.
+func writetag(w io.Writer, node *html.Node) {
+	fmt.Fprintf(w, "<%s", node.Data)
+	for _, a := range node.Attr {
+		if !contains(permittedattr, a.Key) {
+			continue
+		}
+		fmt.Fprintf(w, ` %s="%s"`, a.Key, html.EscapeString(a.Val))
+	}
+	io.WriteString(w, ">")
+}
+
+// jsonfind walks a decoded JSON document along keys and returns the value it
+// reaches, or nil when the path cannot be followed.
+//
+// At each step the current value decides how the key is used: an object is
+// looked up by the key as a string, an array is indexed with the key parsed
+// as a decimal integer. A missing key, a JSON null, an index that is not a
+// number or is out of range, or a value that is neither object nor array
+// all yield nil rather than a panic.
+func jsonfind(jmap map[string]interface{}, keys []string) interface{} {
+	var ii interface{} = jmap
+	for _, key := range keys {
+		switch v := ii.(type) {
+		case map[string]interface{}:
+			ii = v[key]
+		case []interface{}:
+			idx, err := strconv.Atoi(key)
+			if err != nil || idx < 0 || idx >= len(v) {
+				return nil
+			}
+			ii = v[idx]
+		default:
+			return nil
+		}
+		if ii == nil {
+			return nil
+		}
+	}
+	return ii
+}
+
+// blogspotpostid matches text containing "blogspot" followed later by a
+// 'postId': '<digits>' pair and captures the digits. It is compiled once
+// here instead of on every call.
+var blogspotpostid = regexp.MustCompile("blogspot.*'postId': '([0-9]+)'")
+
+// extractblogshit looks for a post id in script and, when one is found,
+// fetches /feeds/posts/default/<id>?alt=json from baseurl's scheme and host
+// and returns the entry.content.$t string of the reply parsed as HTML.
+//
+// It returns nil when script holds no post id, the request fails or does not
+// answer 200 OK, the JSON cannot be decoded, entry.content.$t is missing or
+// not a string, or the content does not parse. Every failure after the
+// request is issued is logged.
+func extractblogshit(script string, baseurl *url.URL) *html.Node {
+	m := blogspotpostid.FindStringSubmatch(script)
+	if m == nil {
+		return nil
+	}
+	targ := fmt.Sprintf("%s://%s/feeds/posts/default/%s?alt=json", baseurl.Scheme, baseurl.Hostname(), m[1])
+	log.Printf("now i'm getting real content from %s", targ)
+	jr, err := http.Get(targ)
+	if err != nil {
+		log.Println("json post err", err)
+		return nil
+	}
+	defer jr.Body.Close()
+	if jr.StatusCode != http.StatusOK {
+		log.Println("json post status", jr.Status)
+		return nil
+	}
+	var jmap map[string]interface{}
+	if err := json.NewDecoder(jr.Body).Decode(&jmap); err != nil {
+		log.Println("json decode error", err)
+		return nil
+	}
+	// Checked assertion: a reply without entry.content.$t must not panic.
+	body, ok := jsonfind(jmap, []string{"entry", "content", "$t"}).(string)
+	if !ok {
+		log.Println("json post has no content")
+		return nil
+	}
+	root, err := html.Parse(strings.NewReader(body))
+	if err != nil {
+		log.Println("json post parse err", err)
+		return nil
+	}
+	return root
+}
+
+// procscriptnode handles a <script> element met while cleaning.
+//
+// A script whose src is on gist.github.com is replaced by the text fetched
+// from that URL (a path ending in ".js" is rewritten to end in "/raw"),
+// escaped with html_template.HTMLEscape inside a bordered <pre>. A script
+// with any other src produces no output. A script without src has its text
+// handed to extractblogshit, and any article that returns is cleaned into w.
+//
+// Fetch failures, non-200 replies and read errors are logged and the script
+// is skipped, so an error page or a truncated body is never presented as the
+// gist's content.
+func procscriptnode(w io.Writer, script *html.Node, baseurl *url.URL) {
+	src := getattr(script, "src")
+	if src == "" {
+		if c := script.FirstChild; c != nil && c.Type == html.TextNode {
+			if article := extractblogshit(c.Data, baseurl); article != nil {
+				clean(w, article, baseurl)
+			}
+		}
+		return
+	}
+	scripturl, err := url.Parse(src)
+	if err != nil {
+		log.Println("frame err", err)
+		return
+	}
+	if scripturl.Host != "gist.github.com" {
+		return
+	}
+	if path := scripturl.Path; strings.HasSuffix(path, ".js") {
+		scripturl.Path = strings.TrimSuffix(path, ".js") + "/raw"
+	}
+	txt, err := http.Get(scripturl.String())
+	if err != nil {
+		log.Println("frame err", err)
+		return
+	}
+	defer txt.Body.Close()
+	if txt.StatusCode != http.StatusOK {
+		log.Println("frame err", txt.Status)
+		return
+	}
+	var buf bytes.Buffer
+	if _, err := io.Copy(&buf, txt.Body); err != nil {
+		log.Println("frame err", err)
+		return
+	}
+	io.WriteString(w, `<pre style="border: 2px solid black; padding: 1em;">`)
+	html_template.HTMLEscape(w, buf.Bytes())
+	io.WriteString(w, `</pre>`)
+}
+
+// wapoimage writes the next unused image entry of blobarray to w and moves
+// startingblob past it. The output is an <img> pointing at
+// washingtonpost.com's imrs.php with the entry's url as its src parameter,
+// followed by a <p> holding the entry's credits_caption_display text.
+//
+// It reports whether an image was written. Entries that are not objects,
+// whose type is not "image", or that carry no string url are skipped instead
+// of panicking on a failed type assertion; a missing caption becomes "".
+func wapoimage(w io.Writer) bool {
+	for i := startingblob; i < len(blobarray); i++ {
+		m, ok := blobarray[i].(map[string]interface{})
+		if !ok {
+			continue
+		}
+		if typ, _ := m["type"].(string); typ != "image" {
+			continue
+		}
+		src, ok := m["url"].(string)
+		if !ok {
+			continue
+		}
+		// The image URL travels as a query parameter, so it must be escaped
+		// or any "&" or "#" inside it would cut the parameter short.
+		src = fmt.Sprintf("https://www.washingtonpost.com/wp-apps/imrs.php?src=%s&w=916", url.QueryEscape(src))
+		alt, _ := m["credits_caption_display"].(string)
+		fmt.Fprintf(w, `<img src="%s"><p>%s`, html.EscapeString(src), html.EscapeString(alt))
+		startingblob = i + 1
+		return true
+	}
+	return false
+}
+
+// clean writes a sanitised rendering of node and its subtree to w.
+//
+// Text nodes are HTML-escaped. Element nodes are handled by tag:
+//   - a: re-emitted with only its href and id attributes
+//   - img: re-emitted with src and alt; data-native-src, data-raw-src and
+//     data-runner-src replace src, in that order of preference; an image
+//     whose class contains "progressiveMedia-thumbnail" or whose src ends
+//     up empty is dropped
+//   - picture: replaced by an <img> built from the last child <source> that
+//     has a data-srcset; without one, only its children are cleaned
+//   - noscript: the Data of its first child is re-parsed as HTML and cleaned
+//   - script: delegated to procscriptnode
+//   - link: rendered unchanged when rel is "alternate" or "icon"
+//   - permittedtags: opening tag via writetag, unless the node has class
+//     hide-for-print or data-qa="article-body-ad" (dropped with children) or
+//     data-qa="article-image" (replaced by the next blobarray image if any)
+//   - bannedtags: dropped with their children
+//   - figure, iframe and every other tag: the tag itself is omitted but its
+//     children are still cleaned
+//
+// After the children, a closing tag is written for a and for every permitted
+// tag except br, and a newline follows each p and div. baseurl is handed on
+// to procscriptnode and to recursive calls.
+func clean(w io.Writer, node *html.Node, baseurl *url.URL) {
+	switch node.Type {
+	case html.ElementNode:
+		tag := node.Data
+		switch {
+		case tag == "a":
+			fmt.Fprintf(w, `<a href="%s" id="%s">`,
+				html.EscapeString(getattr(node, "href")),
+				html.EscapeString(getattr(node, "id")))
+		case tag == "img":
+			src := getattr(node, "src")
+			if native := getattr(node, "data-native-src"); native != "" {
+				src = native
+				// Swap the "-1x-1.jpg" ending for "1200x-1.jpg".
+				if strings.HasSuffix(src, "-1x-1.jpg") {
+					src = strings.TrimSuffix(src, "-1x-1.jpg") + "1200x-1.jpg"
+				}
+			} else if raw := getattr(node, "data-raw-src"); raw != "" {
+				src = raw
+			} else if runner := getattr(node, "data-runner-src"); runner != "" {
+				src = runner
+			}
+			alt := getattr(node, "alt")
+
+			if strings.Contains(getattr(node, "class"), "progressiveMedia-thumbnail") {
+				src = ""
+			}
+			if src != "" {
+				fmt.Fprintf(w, `<img src="%s" alt="%s">`, html.EscapeString(src), html.EscapeString(alt))
+			}
+		case tag == "picture":
+			src := ""
+			for c := node.FirstChild; c != nil; c = c.NextSibling {
+				if c.Type == html.ElementNode && c.Data == "source" {
+					if attr := getattr(c, "data-srcset"); attr != "" {
+						src = attr
+					}
+				}
+			}
+			if src != "" {
+				fmt.Fprintf(w, `<picture><img src="%s"></picture>`,
+					html.EscapeString(src))
+				return
+			}
+		case tag == "figure":
+		case tag == "iframe":
+		case tag == "noscript":
+			// see https://github.com/golang/go/issues/16318
+			if c := node.FirstChild; c != nil {
+				root, err := html.Parse(strings.NewReader(c.Data))
+				if err == nil {
+					clean(w, root, baseurl)
+				}
+			}
+			return
+		case tag == "script":
+			procscriptnode(w, node, baseurl)
+			return
+		case tag == "link":
+			rel := getattr(node, "rel")
+			if rel == "alternate" || rel == "icon" {
+				html.Render(w, node)
+			}
+		case contains(permittedtags, tag):
+			if hasclass(node, "hide-for-print") {
+				return
+			}
+			switch getattr(node, "data-qa") {
+			case "article-body-ad":
+				return
+			case "article-image":
+				// With no image left in blobarray the node is emitted
+				// like any other permitted tag.
+				if wapoimage(w) {
+					return
+				}
+			}
+			writetag(w, node)
+		case contains(bannedtags, tag):
+			return
+		}
+	case html.TextNode:
+		io.WriteString(w, html.EscapeString(node.Data))
+	}
+	for c := node.FirstChild; c != nil; c = c.NextSibling {
+		clean(w, c, baseurl)
+	}
+	if node.Type == html.ElementNode {
+		tag := node.Data
+		if tag == "a" || (contains(permittedtags, tag) && tag != "br") {
+			fmt.Fprintf(w, "</%s>", tag)
+		}
+		if tag == "p" || tag == "div" {
+			io.WriteString(w, "\n")
+		}
+	}
+}
+
author	Ted Unangst <tedu@tedunangst.com>	2022-03-26 01:46:55 -0400
committer	Ted Unangst <tedu@tedunangst.com>	2022-03-26 01:46:55 -0400
commit	a5667801a3282cc7ca04eea670986a75b824bef2 (patch)
tree	dd6c6f36fc18263c7dc9d4275f848149b68e8dfc
parent	8460b97bff37c577973d5c3f8e023746a67fc0f8 (diff)