/*
 * Copyright (c) 2002, Adam Dunkels.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * This file is part of the Contiki desktop environment
 *
 * $Id: htmlparser.c,v 1.1 2006/06/17 22:41:13 adamdunkels Exp $
 *
 */

/* htmlparser.c:
 *
 * Implements a very simplistic HTML parser. It recognizes HTML links
 * (<a>-tags), HTML img alt tags, a few text flow break tags (<br>,
 * <p>, <h>), the <li> tag (but does not even try to distinguish
 * between <ol> or <ul>) as well as HTML comment tags (<!-- -->).
 *
 * To save memory, the HTML parser is state machine driven, which
 * means that it will shave off one character from the HTML page,
 * process that character, and return to the next. Another way of
 * doing it would be to buffer a number of characters and process them
 * together.
 *
 * The main function in this file is the htmlparser_parse() function
 * which takes a htmlparser_state structure and a part of an HTML file
 * as an argument. The htmlparser_parse() function will call the
 * helper functions parse_char() and parse_tag(). Those functions will
 * in turn call the two callback functions htmlparser_char() and
 * htmlparser_tag(). Those functions must be implemented by the using
 * module (e.g., a web browser program).
 *
 * htmlparser_char() will be called for every non-tag character.
 *
 * htmlparser_tag() will be called whenever a full tag has been found.
 *
 * NOTE(review): the paragraph above appears to describe an earlier
 * revision. The helpers in this file are parse_word() and
 * parse_tag(), and the callbacks they invoke are htmlparser_word(),
 * htmlparser_newline(), htmlparser_link(), htmlparser_inputfield(),
 * htmlparser_submitbutton() and htmlparser_renderstate().
 *
 */

#include "htmlparser.h"
#include "html-strings.h"
#include "contiki.h"

#include <string.h>

#if 1
#define PRINTF(x)
#else
#include <stdio.h>
#define PRINTF(x) printf x
#endif

/*-----------------------------------------------------------------------------------*/
/* Character codes are spelled out numerically instead of as character
   literals. NOTE(review): presumably so that the comparisons stay
   correct on targets whose native character set is not ISO 8859-1 -
   confirm before replacing them with literals. */
#define ISO_A     0x41
#define ISO_B     0x42
#define ISO_E     0x45
#define ISO_F     0x46
#define ISO_G     0x47
#define ISO_H     0x48
#define ISO_I     0x49
#define ISO_L     0x4c
#define ISO_M     0x4d
#define ISO_P     0x50
#define ISO_R     0x52
#define ISO_T     0x54

#define ISO_a     (ISO_A | 0x20)
#define ISO_b     (ISO_B | 0x20)
#define ISO_e     (ISO_E | 0x20)
#define ISO_f     (ISO_F | 0x20)
#define ISO_g     (ISO_G | 0x20)
#define ISO_h     (ISO_H | 0x20)
#define ISO_i     (ISO_I | 0x20)
#define ISO_l     (ISO_L | 0x20)
#define ISO_m     (ISO_M | 0x20)
#define ISO_p     (ISO_P | 0x20)
#define ISO_r     (ISO_R | 0x20)
#define ISO_t     (ISO_T | 0x20)

#define ISO_ht    0x09
#define ISO_nl    0x0a
#define ISO_cr    0x0d
#define ISO_space 0x20
#define ISO_bang  0x21
#define ISO_citation 0x22
#define ISO_ampersand 0x26
#define ISO_citation2 0x27
#define ISO_asterisk 0x2a
#define ISO_dash  0x2d
#define ISO_slash 0x2f
#define ISO_semicolon  0x3b
#define ISO_lt    0x3c
#define ISO_eq    0x3d
#define ISO_gt    0x3e

/* 0x5b is '[' and 0x5d is ']', so the two names below read as if they
   were swapped. The frame handling in parse_tag() emits ISO_rbrack
   first and ISO_lbrack last, which produces "[...]". */
#define ISO_rbrack 0x5b
#define ISO_lbrack 0x5d

#define MINORSTATE_NONE           0
#define MINORSTATE_TEXT           1 /* Parse normal text */
#define MINORSTATE_EXTCHAR        2 /* Check for semi-colon */
#define MINORSTATE_TAG            3 /* Check for name of tag. */
#define MINORSTATE_TAGEND         4 /* Scan for end of tag. */
#define MINORSTATE_TAGATTR        5 /* Parse tag attr. */
#define MINORSTATE_TAGATTRSPACE   6 /* Parse optional space after tag
				       attr. */
#define MINORSTATE_TAGATTRPARAM   7 /* Parse tag attr parameter. */
#define MINORSTATE_TAGATTRPARAMNQ 8 /* Parse tag attr parameter without
				       quotation marks. */
#define MINORSTATE_HTMLCOMMENT    9 /* Scan for HTML comment end */

#define MAJORSTATE_NONE    0
#define MAJORSTATE_BODY    1
#define MAJORSTATE_LINK    2
#define MAJORSTATE_FORM    3
#define MAJORSTATE_DISCARD 4

/* Complete parser state. A single file-scope instance (s) is used by
   every function in this file. */
struct htmlparser_state {

  /* Current character-level state (MINORSTATE_*). */
  unsigned char minorstate;

  /* Name of the tag being scanned and the write index into it. */
  char tag[20];
  unsigned char tagptr;

  /* Name of the attribute being scanned and the write index into it. */
  char tagattr[20];
  unsigned char tagattrptr;

  /* Value of the attribute being scanned and the write index into it. */
  char tagattrparam[WWW_CONF_MAX_URLLEN];
  unsigned char tagattrparamptr;

  unsigned char lastchar, quotechar;

  /* Current and previous document-level state (MAJORSTATE_*). */
  unsigned char majorstate, lastmajorstate;

  /* Target of the link currently open (set by the a tag). */
  char linkurl[WWW_CONF_MAX_URLLEN];

  /* Word (or link text) being collected and its length. */
#define MAX_WORDLEN 40
  char word[MAX_WORDLEN];
  unsigned char wordlen;

#if WWW_CONF_FORMS
  char formaction[WWW_CONF_MAX_FORMACTIONLEN];
  char formname[WWW_CONF_MAX_FORMNAMELEN];
  unsigned char inputtype;
  char inputname[WWW_CONF_MAX_INPUTNAMELEN];
  char inputvalue[WWW_CONF_MAX_INPUTVALUELEN];
  unsigned char inputvaluesize;
#endif /* WWW_CONF_FORMS */
};

static struct htmlparser_state s;

/*-----------------------------------------------------------------------------------*/
/* Sentinel entry that terminates the tag table. */
static char last[1] = {0xff};

/* Table of recognized tags. find_tag() narrows a window over this
   table one character at a time, so the entries have to be in
   ascending order (NOTE(review): the html_* strings are defined in
   html-strings.h and are presumably sorted to match - confirm). Each
   TAG_* constant is the table index of the entry that follows it. */
static const char *tags[] = {
#define TAG_FIRST       0
#define TAG_SLASHA      0
  html_slasha,
#define TAG_SLASHCENTER 1
  html_slashcenter,
#define TAG_SLASHFORM   2
  html_slashform,
#define TAG_SLASHH      3
  html_slashh,
#define TAG_SLASHSCRIPT 4
  html_slashscript,
#define TAG_SLASHSELECT 5
  html_slashselect,
#define TAG_SLASHSTYLE  6
  html_slashstyle,
#define TAG_A           7
  html_a,
#define TAG_BODY        8
  html_body,
#define TAG_BR          9
  html_br,
#define TAG_CENTER     10
  html_center,
#define TAG_FORM       11
  html_form,
#define TAG_FRAME      12
  html_frame,
#define TAG_H1         13
  html_h1,
#define TAG_H2         14
  html_h2,
#define TAG_H3         15
  html_h3,
#define TAG_H4         16
  html_h4,
#define TAG_IMG        17
  html_img,
#define TAG_INPUT      18
  html_input,
#define TAG_LI         19
  html_li,
#define TAG_P          20
  html_p,
#define TAG_SCRIPT     21
  html_script,
#define TAG_SELECT     22
  html_select,
#define TAG_STYLE      23
  html_style,
#define TAG_TR         24
  html_tr,
#define TAG_LAST       25
  last,
};

/*-----------------------------------------------------------------------------------*/
/* Returns non-zero if c is a space, newline, carriage return or
   horizontal tab, and zero otherwise. */
static unsigned char CC_FASTCALL
iswhitespace(char c)
{
  return (c == ISO_space ||
	  c == ISO_nl ||
	  c == ISO_cr ||
	  c == ISO_ht);
}
/*-----------------------------------------------------------------------------------*/
/* Resets the parser: both major states are set to "discard", the
   minor state to "text", and lastchar to zero. */
void
htmlparser_init(void)
{
  s.majorstate = s.lastmajorstate = MAJORSTATE_DISCARD;
  s.minorstate = MINORSTATE_TEXT;
  s.lastchar = 0;
}
/*-----------------------------------------------------------------------------------*/
/* Folds c to lower case. Every value above 0x40 is forced into the
   range 0x60-0x7f; everything else is returned unchanged. */
static char CC_FASTCALL
lowercase(char c)
{
  /* XXX: This is a *brute force* approach to lower-case
     converting and should *not* be used anywhere else! It
     works for our purposes, however (i.e., HTML tags). */
  if(c > 0x40) {
    return (c & 0x1f) | 0x60;
  } else {
    return c;
  }
}
/*-----------------------------------------------------------------------------------*/
/* Copies the string src into dest, which is destsize bytes large.
   Unlike a bare strcpy()/strncpy() the result never overruns dest and
   is always zero terminated; overlong input is truncated. */
static void
copy_bounded(char *dest, const char *src, unsigned int destsize)
{
  if(destsize == 0) {
    return;
  }
  strncpy(dest, src, destsize - 1);
  dest[destsize - 1] = 0;
}
/*-----------------------------------------------------------------------------------*/
/* Zero terminates the tag, attribute and attribute value buffers at
   their current write positions.

   The tag name scanner in parse_word() lets s.tagptr grow to
   sizeof(s.tag), so each index is clamped to the last element of its
   buffer before it is used; the terminator is never written outside
   the array. */
static void
endtagfound(void)
{
  unsigned char end;

  end = s.tagptr;
  if(end >= sizeof(s.tag)) {
    end = sizeof(s.tag) - 1;
  }
  s.tag[end] = 0;

  end = s.tagattrptr;
  if(end >= sizeof(s.tagattr)) {
    end = sizeof(s.tagattr) - 1;
  }
  s.tagattr[end] = 0;

  end = s.tagattrparamptr;
  if(end >= sizeof(s.tagattrparam)) {
    end = sizeof(s.tagattrparam) - 1;
  }
  s.tagattrparam[end] = 0;
}
/*-----------------------------------------------------------------------------------*/
/* Makes newstate the current major state and remembers the state that
   was replaced in s.lastmajorstate. Nothing happens if newstate is
   already the current state. */
static void CC_FASTCALL
switch_majorstate(unsigned char newstate)
{
  if(s.majorstate != newstate) {
    PRINTF(("Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate));
    s.lastmajorstate = s.majorstate;
    s.majorstate = newstate;
  }
}
/*-----------------------------------------------------------------------------------*/
/* Appends c to the word buffer. Characters with a value of 0x80 or
   above are dropped. The length is capped at MAX_WORDLEN - 1 so that
   one byte always remains for the terminator; once the buffer is full
   further characters overwrite the last slot. */
static void CC_FASTCALL
add_char(unsigned char c)
{
  if(s.wordlen < MAX_WORDLEN &&
     c < 0x80) {
    s.word[s.wordlen] = c;
    ++s.wordlen;
    if(s.wordlen == MAX_WORDLEN) {
      s.wordlen = MAX_WORDLEN - 1;
    }
  }
}
/*-----------------------------------------------------------------------------------*/
/* Handles the end of a word. What happens to the collected characters
   depends on the major state:

   - link:    the text is kept and a single separating space is added,
              so that the whole link text is delivered at once when
              the link is closed;
   - discard: the text is thrown away;
   - other:   the word is terminated and handed to htmlparser_word().

   Nothing is done if no characters have been collected. */
static void
do_word(void)
{
  if(s.wordlen > 0) {
    if(s.majorstate == MAJORSTATE_LINK) {
      /* Look at the last stored character (index wordlen - 1, valid
	 because wordlen > 0), not at the unwritten slot after it, so
	 that a run of whitespace yields only one space. */
      if(s.word[s.wordlen - 1] != ISO_space) {
	add_char(ISO_space);
      }
    } else if(s.majorstate == MAJORSTATE_DISCARD) {
      s.wordlen = 0;
    } else {
      s.word[s.wordlen] = '\0';
      htmlparser_word(s.word, s.wordlen);
      s.wordlen = 0;
    }
  }
}
/*-----------------------------------------------------------------------------------*/
/* Finishes the pending word and reports a line break through
   htmlparser_newline(). */
static void
newline(void)
{
  do_word();
  htmlparser_newline();
}
/*-----------------------------------------------------------------------------------*/
/* Looks up the zero terminated, lower case tag name in the tags
   table.

   The search keeps a window [lo, hi) of table entries that agree with
   tag on the first pos characters and narrows it one character at a
   time.

   Returns the TAG_* index of the entry that equals tag, or TAG_LAST
   if there is none. */
static unsigned char CC_FASTCALL
find_tag(char *tag)
{
  /* NOTE(review): locals are static as in the rest of this file,
     presumably to keep them off the stack on small targets. */
  static unsigned char lo, hi, pos, idx;
  static char tagc;

  lo = TAG_FIRST;
  hi = TAG_LAST;
  pos = 0;

  do {
    tagc = tag[pos];

    if(tagc == 0 &&
       tags[lo][pos] == 0) {
      return lo;
    }

    idx = lo;

    /* First, find first matching tag from table. The bound is tested
       before the table is read: the entry at hi does not share the
       current prefix and may be shorter than pos characters (the
       sentinel is a single byte), so it must not be indexed. */
    while(idx < hi &&
	  tagc > (tags[idx])[pos]) {
      ++idx;
    }
    lo = idx;

    /* Second, find last matching tag from table. */
    while(idx < hi &&
	  tagc == (tags[idx])[pos]) {
      ++idx;
    }
    hi = idx;

    /* If first and last matching tags are equal, we have a non-match
       and return. Else we continue with the next character. */
    ++pos;

  } while(hi != lo);
  return TAG_LAST;
}
/*-----------------------------------------------------------------------------------*/
/* Acts on the tag described by s.tag, s.tagattr and s.tagattrparam.

   Depending on the tag this emits line breaks, switches the major
   state, or reports links, frames, image alt texts and form widgets
   through the htmlparser_*() callbacks. Tags that are not in the
   table are ignored. */
static void
parse_tag(void)
{
  static char *tagattrparam;
#if WWW_CONF_FORMS
  static unsigned char size, i;
#endif /* WWW_CONF_FORMS */

  PRINTF(("Parsing tag '%s' '%s' '%s'\n",
	  s.tag, s.tagattr, s.tagattrparam));

  switch(find_tag(s.tag)) {
  case TAG_P:
  case TAG_H1:
  case TAG_H2:
  case TAG_H3:
  case TAG_H4:
    /* Paragraphs and headings get two line breaks: one here and one
       from the shared code below. */
    newline();
    /* FALLTHROUGH */
  case TAG_BR:
  case TAG_TR:
  case TAG_SLASHH:
    newline();
    break;
  case TAG_LI:
    /* List items start on a new line with "* ". */
    newline();
    add_char(ISO_asterisk);
    add_char(ISO_space);
    break;
  case TAG_SCRIPT:
  case TAG_STYLE:
  case TAG_SELECT:
    /* The contents of these elements are not rendered. */
    switch_majorstate(MAJORSTATE_DISCARD);
    break;
  case TAG_SLASHSCRIPT:
  case TAG_SLASHSTYLE:
  case TAG_SLASHSELECT:
    switch_majorstate(s.lastmajorstate);
    break;
  case TAG_BODY:
    s.majorstate = s.lastmajorstate = MAJORSTATE_BODY;
    break;
  case TAG_FRAME:
    /* A frame with a non-empty src is presented as a link on a line
       of its own, surrounded by brackets. */
    if(strncmp(s.tagattr, html_src, sizeof(html_src)) == 0 &&
       s.tagattrparam[0] != 0) {
      switch_majorstate(MAJORSTATE_BODY);
      newline();
      add_char(ISO_rbrack);
      do_word();
      htmlparser_link((char *)html_frame, strlen(html_frame), s.tagattrparam);
      PRINTF(("Frame [%s]\n", s.tagattrparam));
      add_char(ISO_lbrack);
      newline();
    }
    break;
  case TAG_IMG:
    /* An image with a non-empty alt text is rendered as "<alt text>". */
    if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 &&
       s.tagattrparam[0] != 0) {
      add_char(ISO_lt);
      tagattrparam = &s.tagattrparam[0];
      while(*tagattrparam) {
	add_char(*tagattrparam);
	++tagattrparam;
      }
      add_char(ISO_gt);
      do_word();
    }
    break;
  case TAG_A:
    PRINTF(("A %s %s\n", s.tagattr, s.tagattrparam));
    if(strncmp(s.tagattr, html_href, sizeof(html_href)) == 0 &&
       s.tagattrparam[0] != 0) {
      /* Remember the target, flush the word in progress and start
	 collecting the link text. */
      copy_bounded(s.linkurl, s.tagattrparam, sizeof(s.linkurl));
      do_word();
      switch_majorstate(MAJORSTATE_LINK);
    }
    break;
  case TAG_SLASHA:
    if(s.majorstate == MAJORSTATE_LINK) {
      switch_majorstate(s.lastmajorstate);
      s.word[s.wordlen] = 0;
      htmlparser_link(s.word, s.wordlen, s.linkurl);
      s.wordlen = 0;
    }
    break;
#if WWW_CONF_FORMS
  case TAG_FORM:
    PRINTF(("Form tag\n"));
    switch_majorstate(MAJORSTATE_FORM);
    if(strncmp(s.tagattr, html_action, sizeof(html_action)) == 0) {
      PRINTF(("Form action '%s'\n", s.tagattrparam));
      copy_bounded(s.formaction, s.tagattrparam,
		   WWW_CONF_MAX_FORMACTIONLEN);
    } else if(strncmp(s.tagattr, html_name, sizeof(html_name)) == 0) {
      PRINTF(("Form name '%s'\n", s.tagattrparam));
      copy_bounded(s.formname, s.tagattrparam,
		   WWW_CONF_MAX_FORMNAMELEN);
    }
    s.inputname[0] = s.inputvalue[0] = 0;
    break;
  case TAG_SLASHFORM:
    switch_majorstate(MAJORSTATE_BODY);
    s.formaction[0] = s.formname[0] = 0;
    break;
  case TAG_INPUT:
    if(s.majorstate == MAJORSTATE_FORM) {
      /* First check if we are called at the end of an input tag. If
	 so, we should render the input widget. */
      if(s.tagattr[0] == 0 &&
	 s.inputname[0] != 0) {
	PRINTF(("Render input type %d\n", s.inputtype));
	switch(s.inputtype) {
	case HTMLPARSER_INPUTTYPE_NONE:
	case HTMLPARSER_INPUTTYPE_TEXT:
	  /* Pad the value with spaces up to the requested field
	     size. inputvaluesize is kept below
	     WWW_CONF_MAX_INPUTVALUELEN where it is set, so the
	     terminator stays inside the buffer. */
	  for(i = 0; i < s.inputvaluesize; ++i) {
	    if(s.inputvalue[i] == 0) {
	      memset(&s.inputvalue[i], ISO_space, s.inputvaluesize - i);
	      s.inputvalue[s.inputvaluesize] = 0;
	      break;
	    }
	  }
	  htmlparser_inputfield(s.inputvalue, s.inputname,
				s.formname, s.formaction);
	  break;
	case HTMLPARSER_INPUTTYPE_SUBMIT:
	case HTMLPARSER_INPUTTYPE_IMAGE:
	  htmlparser_submitbutton(s.inputvalue, s.inputname,
				  s.formname, s.formaction);
	  break;
	}
	s.inputtype = HTMLPARSER_INPUTTYPE_NONE;
      } else {
	PRINTF(("Input '%s' '%s'\n", s.tagattr, s.tagattrparam));
	if(strncmp(s.tagattr, html_type, sizeof(html_type)) == 0) {
	  if(strncmp(s.tagattrparam, html_submit,
		     sizeof(html_submit)) == 0) {
	    s.inputtype = HTMLPARSER_INPUTTYPE_SUBMIT;
	  } else if(strncmp(s.tagattrparam, html_image,
			    sizeof(html_image)) == 0) {
	    s.inputtype = HTMLPARSER_INPUTTYPE_IMAGE;
	  } else if(strncmp(s.tagattrparam, html_text,
			    sizeof(html_text)) == 0) {
	    s.inputtype = HTMLPARSER_INPUTTYPE_TEXT;
	  } else {
	    s.inputtype = HTMLPARSER_INPUTTYPE_OTHER;
	  }
	} else if(strncmp(s.tagattr, html_name,
			  sizeof(html_name)) == 0) {
	  copy_bounded(s.inputname, s.tagattrparam,
		       WWW_CONF_MAX_INPUTNAMELEN);
	} else if(strncmp(s.tagattr, html_alt,
			  sizeof(html_alt)) == 0 &&
		  s.inputtype == HTMLPARSER_INPUTTYPE_IMAGE) {
	  copy_bounded(s.inputvalue, s.tagattrparam,
		       WWW_CONF_MAX_INPUTVALUELEN);
	} else if(strncmp(s.tagattr, html_value,
			  sizeof(html_value)) == 0) {
	  copy_bounded(s.inputvalue, s.tagattrparam,
		       WWW_CONF_MAX_INPUTVALUELEN);
	} else if(strncmp(s.tagattr, html_size,
			  sizeof(html_size)) == 0) {
	  /* Parse at most two decimal digits of the size attribute. */
	  size = 0;
	  if(s.tagattrparam[0] >= '0' &&
	     s.tagattrparam[0] <= '9') {
	    size = s.tagattrparam[0] - '0';
	    if(s.tagattrparam[1] >= '0' &&
	       s.tagattrparam[1] <= '9') {
	      size = size * 10 + (s.tagattrparam[1] - '0');
	    }
	  }
	  if(size >= WWW_CONF_MAX_INPUTVALUELEN) {
	    size = WWW_CONF_MAX_INPUTVALUELEN - 1;
	  }
	  s.inputvaluesize = size;
	}
      }
    }
    break;
#endif /* WWW_CONF_FORMS */
#if WWW_CONF_RENDERSTATE
  case TAG_CENTER:
    newline();
    htmlparser_renderstate(HTMLPARSER_RENDERSTATE_BEGIN |
			   HTMLPARSER_RENDERSTATE_CENTER);
    break;
  case TAG_SLASHCENTER:
    newline();
    htmlparser_renderstate(HTMLPARSER_RENDERSTATE_END |
			   HTMLPARSER_RENDERSTATE_CENTER);
    break;
#endif /* WWW_CONF_RENDERSTATE */
  default:
    /* Unknown tag, or a tag whose handling is compiled out. */
    break;
  }
}
/*-----------------------------------------------------------------------------------*/
/* Feeds up to dlen characters from data through the state machine
   selected by s.minorstate. Each state consumes characters until the
   input is used up or the minor state changes. */
static u16_t
parse_word(char *data, u8_t dlen)
{
  static u8_t i;
  static u8_t len;
  unsigned char c;

  len = dlen;

  switch(s.minorstate) {
  case MINORSTATE_TEXT:
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(iswhitespace(c)) {
	do_word();
      } else if(c == ISO_lt) {
	s.minorstate = MINORSTATE_TAG;
	s.tagptr = 0;
	/*	do_word();*/
	break;
      } else if(c == ISO_ampersand) {
	s.minorstate = MINORSTATE_EXTCHAR;
	break;
      } else {
	add_char(c);
      }
    }
    break;
  case MINORSTATE_EXTCHAR:
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(c == ISO_semicolon) {
	s.minorstate = MINORSTATE_TEXT;
	add_char(' ');
	break;
      } else if(iswhitespace(c)) {
	s.minorstate = MINORSTATE_TEXT;
	add_char('&');
	add_char(' ');
	break;
      }
    }
    break;
  case MINORSTATE_TAG:
    /* We are currently parsing within the name of a tag. We check
       for the end of a tag (the '>' character) or whitespace (which
       indicates that we should parse a tag attr argument
       instead). */
    for(i = 0; i < len; ++i) {
      c = data[i];
      if(c == ISO_gt) {
	/* Full tag found. We continue parsing regular text. */
	s.minorstate = MINORSTATE_TEXT;
	s.tagattrptr = s.tagattrparamptr = 0;
	endtagfound();
	parse_tag();
	break;
      } else if(iswhitespace(c)) {
	/* The name of the tag found. We continue parsing the tag
	   attr.*/
	s.minorstate = MINORSTATE_TAGATTR;
	s.tagattrptr = 0;
	endtagfound();
	break;
      } else {
	/* Keep track of the name of the tag, but convert it to
	   lower case. */
	s.tag[s.tagptr] = lowercase(c);
	++s.tagptr;
	/* Check if the ->tag field is full. If so, we just eat up
	   any data left in the tag. */
	if(s.tagptr == sizeof(s.tag)) {
	  s.minorstate = MINORSTATE_TAGEND;
	  break;
	}
      }
      /* Check for HTML comment, indicated by