/* Copyright (c) 2003-4, WebThing Ltd Author: Nick Kew This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /* Note to Users You are requested to register as a user, at http://apache.webthing.com/registration.html This entitles you to support from the developer (see the webpage for details). I'm unlikely to reply to help/support requests from non-registered users, unless you're paying and/or offering constructive feedback such as bug reports or sensible suggestions for further development. It also makes a small contribution to the effort that's gone into developing this work. */ /* libxml */ #include /* apache */ #include #include #include #include module AP_MODULE_DECLARE_DATA proxy_html_module ; typedef struct urlmap { struct urlmap* next ; const char* from ; const char* to ; } urlmap ; typedef struct { urlmap* map ; const char* doctype ; const char* etag ; unsigned int flags ; } proxy_html_conf ; typedef struct { htmlSAXHandlerPtr sax ; ap_filter_t* f ; proxy_html_conf* cfg ; htmlParserCtxtPtr parser ; apr_bucket_brigade* bb ; xmlCharEncoding enc ; } saxctxt ; static int is_empty_elt(const char* name) { const char** p ; static const char* empty_elts[] = { "br" , "link" , "img" , "hr" , "input" , "meta" , "base" , "area" , "param" , "col" , "frame" , "isindex" , "basefont" , NULL } ; for ( p = empty_elts ; *p ; ++p ) if ( !strcmp( *p, name) ) return 1 ; return 0 ; } typedef struct { const char* name ; const char** attrs ; } elt_t ; #define NORM_LC 0x1 #define NORM_MSSLASH 0x2 #define NORM_RESET 0x4 static char* normalise(unsigned int flags, char* str) { xmlChar* p ; if ( flags & NORM_LC ) for ( p = str ; *p ; ++p ) if ( isupper(*p) ) *p = tolower(*p) ; if ( flags & NORM_MSSLASH ) for ( p = strchr(str, '\\') ; p ; p = strchr(p+1, '\\') ) *p = '/' ; return str ; } static void pstartElement(void* ctxt, const xmlChar* name, const xmlChar** attrs ) { saxctxt* ctx = (saxctxt*) ctxt ; static const char* href[] = { "href", NULL } ; static const char* cite[] = { "cite", NULL } ; static const char* action[] = { "action", NULL } ; static const char* imgattr[] = { "src", "longdesc", "usemap", NULL } ; static const char* inputattr[] = { "src", "usemap", NULL } ; static const char* scriptattr[] = { "src", "for", NULL } ; static const char* frameattr[] = { "src", "longdesc", NULL } ; static const char* objattr[] = { "classid", "codebase", "data", "usemap", NULL } ; static const char* profile[] = { "profile", NULL } ; static const char* background[] = { "background", NULL } ; static const char* codebase[] = { "codebase", NULL } ; static elt_t linked_elts[] = { { "a" , href } , { "form", action } , { "base" , href } , { "area" , href } , { "link" , href } , { "img" , imgattr } , { "input" , inputattr } , { "script" , scriptattr } , { "frame", frameattr } , { "iframe", frameattr } , { "object", objattr } , { "q" , cite } , { "blockquote" , cite } , { "ins" , cite } , { "del" , cite } , { "head" , profile } , { "body" , background } , { "applet", codebase } , { NULL, NULL } } ; ap_fputc(ctx->f->next, ctx->bb, '<') ; ap_fputs(ctx->f->next, ctx->bb, name) ; if ( attrs ) { const char** linkattrs = 0 ; const xmlChar** a ; elt_t* elt ; for ( elt = linked_elts; elt->name != NULL ; ++elt ) if ( !strcmp(elt->name, name) ) { linkattrs = elt->attrs ; break ; } for ( a = attrs ; *a ; a += 2 ) { const xmlChar* value = a[1] ; if ( linkattrs && value ) { int is_uri = 0 ; const char** linkattr = linkattrs ; do { if ( !strcmp(*linkattr, *a) ) { is_uri = 1 ; break ; } } while ( *++linkattr ) ; if ( is_uri ) { urlmap* m ; for ( m = ctx->cfg->map ; m ; m = m->next ) { if ( ! strncasecmp(value, m->from, strlen(m->from) ) ) { value = apr_pstrcat(ctx->f->r->pool, m->to, value+strlen(m->from) , NULL) ; break ; } } } } if ( ! value ) ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL) ; else { if ( ctx->cfg->flags != 0 ) value = normalise(ctx->cfg->flags, apr_pstrdup(ctx->f->r->pool, value ) ) ; ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", value, "\"", NULL) ; } } } if ( is_empty_elt(name) ) ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag) ; else ap_fputc(ctx->f->next, ctx->bb, '>') ; } static void pendElement(void* ctxt, const xmlChar* name) { saxctxt* ctx = (saxctxt*) ctxt ; if ( ! is_empty_elt(name) ) ap_fprintf(ctx->f->next, ctx->bb, "", name) ; } #define FLUSH ap_fwrite(ctx->f->next, ctx->bb, (chars+begin), (i-begin)) ; begin = i+1 static void pcharacters(void* ctxt, const xmlChar *chars, int length) { saxctxt* ctx = (saxctxt*) ctxt ; int i ; int begin ; for ( begin=i=0; if->next, ctx->bb, "&") ; break ; case '<' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "<") ; break ; case '>' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, ">") ; break ; case '"' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, """) ; break ; default : break ; } } FLUSH ; } static void pcdata(void* ctxt, const xmlChar *chars, int length) { saxctxt* ctx = (saxctxt*) ctxt ; ap_fwrite(ctx->f->next, ctx->bb, chars, length) ; } static void pcomment(void* ctxt, const xmlChar *chars) { saxctxt* ctx = (saxctxt*) ctxt ; ap_fputstrs(ctx->f->next, ctx->bb, "", NULL) ; } static htmlSAXHandlerPtr setupSAX(apr_pool_t* pool) { htmlSAXHandlerPtr sax = apr_pcalloc(pool, sizeof(htmlSAXHandler) ) ; sax->startDocument = NULL ; sax->endDocument = NULL ; sax->startElement = pstartElement ; sax->endElement = pendElement ; sax->characters = pcharacters ; sax->comment = pcomment ; sax->cdataBlock = pcdata ; return sax ; } static char* ctype2encoding(apr_pool_t* pool, const char* in) { char* x ; char* ptr ; char* ctype ; if ( ! in ) return 0 ; if ( ctype = strdup(in) , ! ctype ) return 0 ; for ( ptr = ctype ; *ptr; ++ptr) if ( isupper(*ptr) ) *ptr = tolower(*ptr) ; if ( ptr = strstr(ctype, "charset=") , ptr > 0 ) { ptr += 8 ; // jump over "charset=" and chop anything that follows charset if ( x = strchr(ptr, ' ') , x ) *x = 0 ; if ( x = strchr(ptr, ';') , x ) *x = 0 ; } x = ptr ? apr_pstrdup(pool, ptr) : 0 ; free (ctype ) ; return x ; } static int proxy_html_filter_init(ap_filter_t* f) { saxctxt* fctx ; /* remove content-length filter */ ap_filter_rec_t* clf = ap_get_output_filter_handle("CONTENT_LENGTH") ; ap_filter_t* ff = f->next ; do { ap_filter_t* fnext = ff->next ; if ( ff->frec == clf ) ap_remove_output_filter(ff) ; ff = fnext ; } while ( ff ) ; fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt)) ; fctx->sax = setupSAX(f->r->pool) ; fctx->f = f ; fctx->bb = apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc) ; fctx->cfg = ap_get_module_config(f->r->per_dir_config,&proxy_html_module); /* Note the encoding now, before updating content-type */ fctx->enc = xmlParseCharEncoding (ctype2encoding(f->r->pool, f->r->content_type)) ; if ( f->r->proto_num >= 1001 ) { if ( ! f->r->main && ! f->r->prev ) f->r->chunked = 1 ; } apr_table_unset(f->r->headers_out, "Content-Length") ; apr_table_unset(f->r->headers_out, "ETag") ; ap_set_content_type(f->r, "text/html;charset=utf-8") ; ap_fputs(f->next, fctx->bb, fctx->cfg->doctype) ; return OK ; } static saxctxt* check_filter_init (ap_filter_t* f) { if ( f->r->proxyreq && f->r->content_type ) { if ( strncasecmp(f->r->content_type, "text/html", 9) && strncasecmp(f->r->content_type, "application/xhtml+xml", 21) ) { ap_remove_output_filter(f) ; return NULL ; } } if ( ! f->ctx ) proxy_html_filter_init(f) ; return f->ctx ; } static int proxy_html_filter(ap_filter_t* f, apr_bucket_brigade* bb) { apr_bucket* b ; const char* buf = 0 ; apr_size_t bytes = 0 ; saxctxt* ctxt = check_filter_init(f) ; if ( ! ctxt ) return ap_pass_brigade(f->next, bb) ; for ( b = APR_BRIGADE_FIRST(bb) ; b != APR_BRIGADE_SENTINEL(bb) ; b = APR_BUCKET_NEXT(b) ) { if ( APR_BUCKET_IS_EOS(b) ) { if ( ctxt->parser != NULL ) { htmlParseChunk(ctxt->parser, buf, 0, 1) ; htmlFreeParserCtxt(ctxt->parser) ; } APR_BRIGADE_INSERT_TAIL(ctxt->bb, apr_bucket_eos_create(ctxt->bb->bucket_alloc) ) ; ap_pass_brigade(ctxt->f->next, ctxt->bb) ; } else if ( apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ) == APR_SUCCESS ) { if ( ctxt->parser == NULL ) ctxt->parser = htmlCreatePushParserCtxt(ctxt->sax, ctxt, buf, bytes, 0, ctxt->enc) ; else htmlParseChunk(ctxt->parser, buf, bytes, 0) ; } else { ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, "Error in bucket read") ; } } //ap_fflush(ctxt->f->next, ctxt->bb) ; // uncomment for debug apr_brigade_destroy(bb) ; return APR_SUCCESS ; } static const char* fpi_html = "\n" ; static const char* fpi_html_legacy = "\n" ; static const char* fpi_xhtml = "\n" ; static const char* fpi_xhtml_legacy = "\n" ; static const char* html_etag = ">" ; static const char* xhtml_etag = " />" ; #define DEFAULT_DOCTYPE fpi_html #define DEFAULT_ETAG html_etag static void* proxy_html_config(apr_pool_t* pool, char* x) { proxy_html_conf* ret = apr_pcalloc(pool, sizeof(proxy_html_conf) ) ; ret->doctype = DEFAULT_DOCTYPE ; ret->etag = DEFAULT_ETAG ; return ret ; } static void* proxy_html_merge(apr_pool_t* pool, void* BASE, void* ADD) { proxy_html_conf* base = (proxy_html_conf*) BASE ; proxy_html_conf* add = (proxy_html_conf*) ADD ; proxy_html_conf* conf = apr_palloc(pool, sizeof(proxy_html_conf)) ; if ( add->map && base->map ) { urlmap* a ; conf->map = NULL ; for ( a = base->map ; a ; a = a->next ) { urlmap* save = conf->map ; conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ; conf->map->next = save ; } for ( a = add->map ; a ; a = a->next ) { urlmap* save = conf->map ; conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ; conf->map->next = save ; } } else conf->map = add->map ? add->map : base->map ; conf->doctype = ( add->doctype == DEFAULT_DOCTYPE ) ? base->doctype : add->doctype ; conf->etag = ( add->etag == DEFAULT_ETAG ) ? base->etag : add->etag ; if ( add->flags & NORM_RESET ) conf->flags = add->flags ^ NORM_RESET ; else conf->flags = base->flags | add->flags ; return conf ; } static const char* set_urlmap(cmd_parms* cmd, void* CFG, const char* from, const char* to) { proxy_html_conf* cfg = (proxy_html_conf*)CFG ; urlmap* oldmap = cfg->map ; urlmap* newmap = apr_palloc(cmd->pool, sizeof(urlmap) ) ; newmap->from = apr_pstrdup(cmd->pool, from) ; newmap->to = apr_pstrdup(cmd->pool, to) ; newmap->next = NULL ; if ( oldmap ) { while ( oldmap->next ) oldmap = oldmap->next ; oldmap->next = newmap ; } else cfg->map = newmap ; return NULL ; } static const char* set_doctype(cmd_parms* cmd, void* CFG, const char* t, const char* l) { proxy_html_conf* cfg = (proxy_html_conf*)CFG ; if ( !strcasecmp(t, "xhtml") ) { cfg->etag = xhtml_etag ; if ( l && !strcasecmp(l, "legacy") ) cfg->doctype = fpi_xhtml_legacy ; else cfg->doctype = fpi_xhtml ; } else if ( !strcasecmp(t, "html") ) { cfg->etag = html_etag ; if ( l && !strcasecmp(l, "legacy") ) cfg->doctype = fpi_html_legacy ; else cfg->doctype = fpi_html ; } else { cfg->doctype = apr_pstrdup(cmd->pool, t) ; if ( l && ( ( l[0] == 'x' ) || ( l[0] == 'X' ) ) ) cfg->etag = xhtml_etag ; } return NULL ; } static void set_param(proxy_html_conf* cfg, const char* arg) { if ( arg && *arg ) if ( !strcmp(arg, "lowercase") ) cfg->flags |= NORM_LC ; else if ( !strcmp(arg, "dospath") ) cfg->flags |= NORM_MSSLASH ; else if ( !strcmp(arg, "reset") ) cfg->flags |= NORM_RESET ; } static const char* set_flags(cmd_parms* cmd, void* CFG, const char* arg1, const char* arg2, const char* arg3) { set_param( (proxy_html_conf*)CFG, arg1) ; set_param( (proxy_html_conf*)CFG, arg2) ; set_param( (proxy_html_conf*)CFG, arg3) ; return NULL ; } static const command_rec proxy_html_cmds[] = { AP_INIT_TAKE2("ProxyHTMLURLMap", set_urlmap, NULL, OR_ALL, "Map URL From To" ) , AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL, OR_ALL, "(HTML|XHTML) [Legacy]" ) , AP_INIT_TAKE123("ProxyHTMLFixups", set_flags, NULL, OR_ALL, "Options are lowercase, dospath" ) , { NULL } } ; static void proxy_html_hooks(apr_pool_t* p) { ap_register_output_filter("proxy-html", proxy_html_filter, NULL, AP_FTYPE_RESOURCE) ; } module AP_MODULE_DECLARE_DATA proxy_html_module = { STANDARD20_MODULE_STUFF, proxy_html_config, proxy_html_merge, NULL, NULL, proxy_html_cmds, proxy_html_hooks } ;