summary refs log tree commit diff
diff options
context:
space:
mode:
authorTed Unangst <tedu@tedunangst.com>2019-08-13 13:31:08 -0400
committerTed Unangst <tedu@tedunangst.com>2019-08-13 13:31:08 -0400
commit73f6f6c19def829aeb79b1d2b91449f81f4fc67b (patch)
treec3c5abaa27425397e2ec3e8582928d9bf20774e8
parentcd6ba9eefd8e41cf42299266d40df1d9e025628f (diff)
rework filtering to provide some more features. try to document some of it even.
-rw-r--r--README2
-rw-r--r--filter.go9
-rw-r--r--filter.lua22
-rw-r--r--luafunctions.go47
-rw-r--r--manual.txt74
-rw-r--r--miniwebproxy.go11
6 files changed, 144 insertions, 21 deletions
diff --git a/README b/README
index 1f15211..68816ee 100644
--- a/README
+++ b/README
@@ -12,6 +12,8 @@ Rewrite engine handles some common special cases like github and twitter embeds.
 
 Filter rules loaded on the fly for easy reconfiguration.
 
+Refer to manual.txt for more.
+
 -- requirements
 
 github.com/andybalholm/cascadia
diff --git a/filter.go b/filter.go
index eebee8e..8286a65 100644
--- a/filter.go
+++ b/filter.go
@@ -190,7 +190,7 @@ func insertframecontents(w io.Writer, node *html.Node, baseurl *url.URL) {
 		for _, script := range scripts {
 			procscriptnode(w, script, baseurl)
 		}
-		for _, selname := range []string { "blockquote", "div.h-card", "div.e-content" } {
+		for _, selname := range []string{"blockquote", "div.h-card", "div.e-content"} {
 			sel := cascadia.MustCompile(selname)
 			divs := sel.MatchAll(root)
 			for _, d := range divs {
@@ -294,6 +294,11 @@ func clean(w io.Writer, node *html.Node, baseurl *url.URL) {
 }
 
 func filter(L *Interpreter, w io.Writer, resp *http.Response, req *http.Request) {
+	filtresult := L.Call("respfilter", req, resp, w)
+	if filtresult != "" {
+		log.Printf("respfilter says %s", filtresult)
+		return
+	}
 	if resp.StatusCode != 200 {
 		resp.Write(w)
 		return
@@ -359,7 +364,7 @@ func dofiltering(L *Interpreter, under io.WriteCloser, w flushWriter, reader io.
 		return
 	}
 
-	filtresult := L.Call("filter", w, req, req.URL.Hostname(), req.URL.Path, root)
+	filtresult := L.Call("htmlfilter", w, req, req.URL.Hostname(), req.URL.Path, root)
 	if filtresult == "" {
 		log.Printf("did not rewrite %s, raw rendering", req.URL.String())
 		html.Render(w, root)
diff --git a/filter.lua b/filter.lua
index fe58d94..7cc7256 100644
--- a/filter.lua
+++ b/filter.lua
@@ -26,6 +26,12 @@ local annoyances = {
 	cssfilter(".newsletter-signup"),
 }
 
+function newconnection(req)
+end
+
+function endconnection()
+end
+
 function shouldfilter(host)
 	if host:match("github.com") then return "" end
 	if host:match("amazon.com") then return "" end
@@ -40,7 +46,21 @@ function prefilter(req)
 	reqsethost(req, host)
 end
 
-function filter(outw, req, host, path, root)
+local dislikeCF = false
+
+function respfilter(req, resp, outw)
+	if dislikeCF then
+		local serv = respgetheader(resp, "Server")
+		if serv == "cloudflare" then
+			writestring(outw, "HTTP/1.1 302 Moved Temporarily\r\n")
+			writestring(outw, "Location: https://web.archive.org/save/" .. reqgethost(req) ..
+				reqgetpath(req) .. "\r\n\r\n")
+			return "done"
+		end
+	end
+end
+
+function htmlfilter(outw, req, host, path, root)
 	print("trying to filter in lua", host, path)
 	if true and path == "/" then
 		return ""
diff --git a/luafunctions.go b/luafunctions.go
index 3310c5f..7797522 100644
--- a/luafunctions.go
+++ b/luafunctions.go
@@ -25,39 +25,51 @@ import (
 	"golang.org/x/net/html"
 )
 
-func cssfilter(css string) cascadia.Selector {
+func lcssfilter(css string) cascadia.Selector {
 	return cascadia.MustCompile(css)
 }
 
-func matchfirst(node *html.Node, sel cascadia.Selector) *html.Node {
+func lmatchfirst(node *html.Node, sel cascadia.Selector) *html.Node {
 	return sel.MatchFirst(node)
 }
 
-func matchall(node *html.Node, sel cascadia.Selector) []*html.Node {
+func lmatchall(node *html.Node, sel cascadia.Selector) []*html.Node {
 	return sel.MatchAll(node)
 }
 
-func removenode(node *html.Node) {
+func lremovenode(node *html.Node) {
 	node.Parent.RemoveChild(node)
 }
 
-func writestring(w io.Writer, s string) {
+func lwritestring(w io.Writer, s string) {
 	io.WriteString(w, s)
 }
 
-func reqgethost(req *http.Request) string {
+func lreqgethost(req *http.Request) string {
 	return req.URL.Host
 }
 
-func reqsethost(req *http.Request, host string) {
+func lreqsethost(req *http.Request, host string) {
 	req.URL.Host = host
 }
 
+func lreqgetpath(req *http.Request) string {
+	return req.URL.Path
+}
+
+func lreqsetpath(req *http.Request, path string) {
+	req.URL.Path = path
+}
+
+func lrespgetheader(resp *http.Response, name string) string {
+	return resp.Header.Get(name)
+}
+
 func lclean(w io.Writer, node *html.Node, req *http.Request) {
 	clean(w, node, req.URL)
 }
 
-func savehtml(filename string, node *html.Node) string {
+func lsavehtml(filename string, node *html.Node) string {
 	fd, err := os.Create(filename)
 	if err != nil {
 		return err.Error()
@@ -68,13 +80,16 @@ func savehtml(filename string, node *html.Node) string {
 }
 
 func addfunctions(L *lua.LState) {
-	L.SetGlobal("cssfilter", L.NewFunction(func2lua(cssfilter)))
-	L.SetGlobal("matchfirst", L.NewFunction(func2lua(matchfirst)))
-	L.SetGlobal("matchall", L.NewFunction(func2lua(matchall)))
-	L.SetGlobal("removenode", L.NewFunction(func2lua(removenode)))
+	L.SetGlobal("cssfilter", L.NewFunction(func2lua(lcssfilter)))
+	L.SetGlobal("matchfirst", L.NewFunction(func2lua(lmatchfirst)))
+	L.SetGlobal("matchall", L.NewFunction(func2lua(lmatchall)))
+	L.SetGlobal("removenode", L.NewFunction(func2lua(lremovenode)))
 	L.SetGlobal("clean", L.NewFunction(func2lua(lclean)))
-	L.SetGlobal("savehtml", L.NewFunction(func2lua(savehtml)))
-	L.SetGlobal("writestring", L.NewFunction(func2lua(writestring)))
-	L.SetGlobal("reqgethost", L.NewFunction(func2lua(reqgethost)))
-	L.SetGlobal("reqsethost", L.NewFunction(func2lua(reqsethost)))
+	L.SetGlobal("savehtml", L.NewFunction(func2lua(lsavehtml)))
+	L.SetGlobal("writestring", L.NewFunction(func2lua(lwritestring)))
+	L.SetGlobal("reqgethost", L.NewFunction(func2lua(lreqgethost)))
+	L.SetGlobal("reqsethost", L.NewFunction(func2lua(lreqsethost)))
+	L.SetGlobal("reqgetpath", L.NewFunction(func2lua(lreqgetpath)))
+	L.SetGlobal("reqsetpath", L.NewFunction(func2lua(lreqsetpath)))
+	L.SetGlobal("respgetheader", L.NewFunction(func2lua(lrespgetheader)))
 }
diff --git a/manual.txt b/manual.txt
new file mode 100644
index 0000000..0e7e7eb
--- /dev/null
+++ b/manual.txt
@@ -0,0 +1,74 @@
+
+Introduction to filtering
+
+miniwebproxy handles the networking and HTTP aspects of web connections. It
+provides user control over the responses via callbacks written in lua. The
+filtering engine is designed and optimized for rewriting and simplifying HTML
+content, but it can perform some other operations as well.
+
+The filters live in filter.lua.
+
+Callbacks
+
+newconnection(req)
+Called after reading each request, before forwarding. Typically does nothing.
+
+endconnection()
+Called after everything is done. Typically does nothing.
+
+shouldfilter(host) string
+Called to determine if filtering should be skipped. Typically returns "y".
+Some sites, such as git front ends, lose too much user interface after
+aggressive filtering. They can be skipped by returning an empty string, "".
+
+prefilter(req)
+Allows rewriting the request. If you prefer old reddit to new reddit:
+	local host = reqgethost(req)
+	host = host:gsub("www.reddit.com", "old.reddit.com")
+	reqsethost(req, host)
+
+respfilter(req, resp, outw) string
+Allows lowlevel rewriting of the response, before other processing. Tor users
+tired of Cloudflare captchas may try this to redirect to a web archive
+instead. Typically doesn't do much because it's too lowlevel.
+Returns a nonempty string to prevent further processing of the request.
+outw is the raw HTTP stream.
+
+htmlfilter(outw, req, host, path, root) string
+The fun part. The root of the parsed HTML document is provided for filtering
+and reduction. See below for available functions.
+An empty string return indicates that filtering failed, and the original
+response should be sent to the browser.
+outw is the response body (an HTML document).
+
+Available Functions
+
+At the current time, everything is pure imperative, no OO.
+
+reqgethost(req), reqgetpath(req) - get the host and path of a request
+reqsethost(req, host), reqsetpath(req, path) - set the host and path of a request
+respgetheader(resp, name) - get a header from the response
+
+writestring(w, string) - write a string to the output (the browser)
+
+cssfilter(string) selector - compile a new css selector
+
+matchall(node, selector) - return an array of all matching document nodes
+matchfirst(node, selector) - return just the first matching node, or nil
+
+removenode(node) - remove a node from the document
+
+Examples
+
+There's a fairly complete filter provided in filter.lua by default.
+
+To remove all comments from an article, assuming they are individual div
+elements with a class of comment.
+
+	local sel = cssfilter("div.comment")
+	local comms = matchall(article, sel)
+	for i = 1, #comms do
+		removenode(comms[i])
+	end
+
+
diff --git a/miniwebproxy.go b/miniwebproxy.go
index b6ea959..11ba199 100644
--- a/miniwebproxy.go
+++ b/miniwebproxy.go
@@ -69,6 +69,11 @@ func getDialer() contextDialer {
 	return &net.Dialer{}
 }
 
+func finishinterpreter(L *Interpreter) {
+	defer putinterpreter(L)
+	L.Call("endconnection")
+}
+
 // this should not be different from the TLS intercept proxy
 func proxyreq(w http.ResponseWriter, r *http.Request) {
 	deadline := time.Now().Add(1 * time.Minute)
@@ -106,7 +111,8 @@ func proxyreq(w http.ResponseWriter, r *http.Request) {
 	}
 
 	L := getinterpreter()
-	defer putinterpreter(L)
+	defer finishinterpreter(L)
+	L.Call("newconnection", r)
 	if !shouldintercept(L, r.URL.Hostname()) {
 		resp.Write(clientconn)
 		return
@@ -264,7 +270,8 @@ func (pxr *Proxer) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	}
 
 	L := getinterpreter()
-	defer putinterpreter(L)
+	defer finishinterpreter(L)
+	L.Call("newconnection", r)
 	if !shouldintercept(L, desthost) {
 		serverconn, err := connect(ctx, dest, "")
 		if err != nil {