diff options
author | Ted Unangst <tedu@tedunangst.com> | 2019-08-13 13:31:08 -0400 |
---|---|---|
committer | Ted Unangst <tedu@tedunangst.com> | 2019-08-13 13:31:08 -0400 |
commit | 73f6f6c19def829aeb79b1d2b91449f81f4fc67b (patch) | |
tree | c3c5abaa27425397e2ec3e8582928d9bf20774e8 | |
parent | cd6ba9eefd8e41cf42299266d40df1d9e025628f (diff) |
rework filtering to provide some more features. try to document some of it even.
-rw-r--r-- | README | 2 | ||||
-rw-r--r-- | filter.go | 9 | ||||
-rw-r--r-- | filter.lua | 22 | ||||
-rw-r--r-- | luafunctions.go | 47 | ||||
-rw-r--r-- | manual.txt | 74 | ||||
-rw-r--r-- | miniwebproxy.go | 11 |
6 files changed, 144 insertions, 21 deletions
diff --git a/README b/README index 1f15211..68816ee 100644 --- a/README +++ b/README @@ -12,6 +12,8 @@ Rewrite engine handles some common special cases like github and twitter embeds. Filter rules loaded on the fly for easy reconfiguration. +Refer to manual.txt for more. + -- requirements github.com/andybalholm/cascadia diff --git a/filter.go b/filter.go index eebee8e..8286a65 100644 --- a/filter.go +++ b/filter.go @@ -190,7 +190,7 @@ func insertframecontents(w io.Writer, node *html.Node, baseurl *url.URL) { for _, script := range scripts { procscriptnode(w, script, baseurl) } - for _, selname := range []string { "blockquote", "div.h-card", "div.e-content" } { + for _, selname := range []string{"blockquote", "div.h-card", "div.e-content"} { sel := cascadia.MustCompile(selname) divs := sel.MatchAll(root) for _, d := range divs { @@ -294,6 +294,11 @@ func clean(w io.Writer, node *html.Node, baseurl *url.URL) { } func filter(L *Interpreter, w io.Writer, resp *http.Response, req *http.Request) { + filtresult := L.Call("respfilter", req, resp, w) + if filtresult != "" { + log.Printf("respfilter says %s", filtresult) + return + } if resp.StatusCode != 200 { resp.Write(w) return @@ -359,7 +364,7 @@ func dofiltering(L *Interpreter, under io.WriteCloser, w flushWriter, reader io. 
return } - filtresult := L.Call("filter", w, req, req.URL.Hostname(), req.URL.Path, root) + filtresult := L.Call("htmlfilter", w, req, req.URL.Hostname(), req.URL.Path, root) if filtresult == "" { log.Printf("did not rewrite %s, raw rendering", req.URL.String()) html.Render(w, root) diff --git a/filter.lua b/filter.lua index fe58d94..7cc7256 100644 --- a/filter.lua +++ b/filter.lua @@ -26,6 +26,12 @@ local annoyances = { cssfilter(".newsletter-signup"), } +function newconnection(req) +end + +function endconnection() +end + function shouldfilter(host) if host:match("github.com") then return "" end if host:match("amazon.com") then return "" end @@ -40,7 +46,21 @@ function prefilter(req) reqsethost(req, host) end -function filter(outw, req, host, path, root) +local dislikeCF = false + +function respfilter(req, resp, outw) + if dislikeCF then + local serv = respgetheader(resp, "Server") + if serv == "cloudflare" then + writestring(outw, "HTTP/1.1 302 Moved Temporarily\r\n") + writestring(outw, "Location: https://web.archive.org/save/" .. reqgethost(req) .. + reqgetpath(req) .. 
"\r\n\r\n") + return "done" + end + end +end + +function htmlfilter(outw, req, host, path, root) print("trying to filter in lua", host, path) if true and path == "/" then return "" diff --git a/luafunctions.go b/luafunctions.go index 3310c5f..7797522 100644 --- a/luafunctions.go +++ b/luafunctions.go @@ -25,39 +25,51 @@ import ( "golang.org/x/net/html" ) -func cssfilter(css string) cascadia.Selector { +func lcssfilter(css string) cascadia.Selector { return cascadia.MustCompile(css) } -func matchfirst(node *html.Node, sel cascadia.Selector) *html.Node { +func lmatchfirst(node *html.Node, sel cascadia.Selector) *html.Node { return sel.MatchFirst(node) } -func matchall(node *html.Node, sel cascadia.Selector) []*html.Node { +func lmatchall(node *html.Node, sel cascadia.Selector) []*html.Node { return sel.MatchAll(node) } -func removenode(node *html.Node) { +func lremovenode(node *html.Node) { node.Parent.RemoveChild(node) } -func writestring(w io.Writer, s string) { +func lwritestring(w io.Writer, s string) { io.WriteString(w, s) } -func reqgethost(req *http.Request) string { +func lreqgethost(req *http.Request) string { return req.URL.Host } -func reqsethost(req *http.Request, host string) { +func lreqsethost(req *http.Request, host string) { req.URL.Host = host } +func lreqgetpath(req *http.Request) string { + return req.URL.Path +} + +func lreqsetpath(req *http.Request, path string) { + req.URL.Path = path +} + +func lrespgetheader(resp *http.Response, name string) string { + return resp.Header.Get(name) +} + func lclean(w io.Writer, node *html.Node, req *http.Request) { clean(w, node, req.URL) } -func savehtml(filename string, node *html.Node) string { +func lsavehtml(filename string, node *html.Node) string { fd, err := os.Create(filename) if err != nil { return err.Error() @@ -68,13 +80,16 @@ func savehtml(filename string, node *html.Node) string { } func addfunctions(L *lua.LState) { - L.SetGlobal("cssfilter", L.NewFunction(func2lua(cssfilter))) - 
L.SetGlobal("matchfirst", L.NewFunction(func2lua(matchfirst))) - L.SetGlobal("matchall", L.NewFunction(func2lua(matchall))) - L.SetGlobal("removenode", L.NewFunction(func2lua(removenode))) + L.SetGlobal("cssfilter", L.NewFunction(func2lua(lcssfilter))) + L.SetGlobal("matchfirst", L.NewFunction(func2lua(lmatchfirst))) + L.SetGlobal("matchall", L.NewFunction(func2lua(lmatchall))) + L.SetGlobal("removenode", L.NewFunction(func2lua(lremovenode))) L.SetGlobal("clean", L.NewFunction(func2lua(lclean))) - L.SetGlobal("savehtml", L.NewFunction(func2lua(savehtml))) - L.SetGlobal("writestring", L.NewFunction(func2lua(writestring))) - L.SetGlobal("reqgethost", L.NewFunction(func2lua(reqgethost))) - L.SetGlobal("reqsethost", L.NewFunction(func2lua(reqsethost))) + L.SetGlobal("savehtml", L.NewFunction(func2lua(lsavehtml))) + L.SetGlobal("writestring", L.NewFunction(func2lua(lwritestring))) + L.SetGlobal("reqgethost", L.NewFunction(func2lua(lreqgethost))) + L.SetGlobal("reqsethost", L.NewFunction(func2lua(lreqsethost))) + L.SetGlobal("reqgetpath", L.NewFunction(func2lua(lreqgetpath))) + L.SetGlobal("reqsetpath", L.NewFunction(func2lua(lreqsetpath))) + L.SetGlobal("respgetheader", L.NewFunction(func2lua(lrespgetheader))) } diff --git a/manual.txt b/manual.txt new file mode 100644 index 0000000..0e7e7eb --- /dev/null +++ b/manual.txt @@ -0,0 +1,74 @@ + +Introduction to filtering + +miniwebproxy handles the networking and HTTP aspects of web connections. It +provides user control over the responses via callbacks written in lua. The +filtering engine is designed and optimized for rewriting and simplifying HTML +content, but it can perform some other operations as well. + +The filters live in filter.lua. + +Callbacks + +newconnection(req) +Called after reading each request, before forwarding. Typically does nothing. + +endconnection() +Called after everything is done. Typically does nothing. + +shouldfilter(host) string +Called to determine if filtering should be skipped.
Typically returns "y". +Some sites, such as git front ends, lose too much user interface after +aggressive filtering. They can be skipped by returning an empty string, "". + +prefilter(req) +Allows rewriting the request. If you prefer old reddit to new reddit: + local host = reqgethost(req) + host = host:gsub("www.reddit.com", "old.reddit.com") + reqsethost(req, host) + +respfilter(req, resp, outw) string +Allows lowlevel rewriting of the response, before other processing. Tor users +tired of Cloudflare captchas may try this to redirect to a web archive +instead. Typically doesn't do much because it's too lowlevel. +Returns a nonempty string to prevent further processing of the request. +outw is the raw HTTP stream. + +htmlfilter(outw, req, host, path, root) string +The fun part. The root of the parsed HTML document is provided for filtering +and reduction. See below for available functions. +An empty string return indicates that filtering failed, and the original +response should be sent to the browser. +outw is the response body (an HTML document). + +Available Functions + +At the current time, everything is pure imperative, no OO. + +reqgethost(req), reqgetpath(req) - get the host and path of a request +reqsethost(req, host), reqsetpath(req, path) - set the host and path of a request +respgetheader(resp, name) - get a header from the response + +writestring(w, string) - write a string to the output (the browser) + +cssfilter(string) selector - compile a new css selector + +matchall(node, selector) - return an array of all matching document nodes +matchfirst(node, selector) - return just the first matching node, or nil + +removenode(node) - remove a node from the document + +Examples + +There's a fairly complete filter provided in filter.lua by default. + +To remove all comments from an article, assuming they are individual div +elements with a class of comment.
+ + local sel = cssfilter("div.comment") + local comms = matchall(article, sel) + for i = 1, #comms do + removenode(comms[i]) + end + + diff --git a/miniwebproxy.go b/miniwebproxy.go index b6ea959..11ba199 100644 --- a/miniwebproxy.go +++ b/miniwebproxy.go @@ -69,6 +69,11 @@ func getDialer() contextDialer { return &net.Dialer{} } +func finishinterpreter(L *Interpreter) { + defer putinterpreter(L) + L.Call("endconnection") +} + // this should not be different from the TLS intercept proxy func proxyreq(w http.ResponseWriter, r *http.Request) { deadline := time.Now().Add(1 * time.Minute) @@ -106,7 +111,8 @@ func proxyreq(w http.ResponseWriter, r *http.Request) { } L := getinterpreter() - defer putinterpreter(L) + defer finishinterpreter(L) + L.Call("newconnection", r) if !shouldintercept(L, r.URL.Hostname()) { resp.Write(clientconn) return @@ -264,7 +270,8 @@ func (pxr *Proxer) ServeHTTP(w http.ResponseWriter, r *http.Request) { } L := getinterpreter() - defer putinterpreter(L) + defer finishinterpreter(L) + L.Call("newconnection", r) if !shouldintercept(L, desthost) { serverconn, err := connect(ctx, dest, "") if err != nil { |