/*
  htmlchk -- checks whether a page has directives that explicitly disable caching
  Copyright (C) 2003  Pedro Zorzenon Neto <pzn@autsens.com>

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "htmlchk.h"
#include "sqclient.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <assert.h>

#define TMPFILE_TIMEOUT 50
#define BUFSIZE 32768

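/* For testing and debugging purposes, move these defines out of this
   comment: HTMLCHKTEST builds the test main() at the end of this file,
   VERBOSE reports the recognised tags on stderr.
#define HTMLCHKTEST
#define VERBOSE */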

/*
 * Allocate an htmlchk_t and reserve a private temporary file for it.
 *
 * A candidate name is taken from tmpnam() and the file is created with
 * O_CREAT | O_EXCL, so creation only succeeds when the name is not
 * already in use; up to TMPFILE_TIMEOUT candidates are tried.  The
 * (empty, owner read/write) file is left on disk and a private copy of
 * its name is stored in self->tmpfile.
 *
 * Returns a heap-allocated object; release it with htmlchk_destroy().
 * Never returns NULL: the process is aborted when no temporary file can
 * be created or when memory is exhausted.
 */
htmlchk_t * htmlchk_init (void) {
  htmlchk_t * self;
  char * fn = NULL;
  size_t fnsize;
  int maxtry = TMPFILE_TIMEOUT;

  while ( (fn==NULL) && (maxtry>0) )
    {
      int fd = -1;

      /* I want "tmpnam", not "mkstemp", ignore warning when compiling */
      fn = tmpnam(NULL);
      if (fn != NULL)
        {
          /* tmpnam() may fail and return NULL; never hand that to open() */
          fd = open(fn, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
        }

      if (fd == -1)
        {
          /* no name, or open error: try another candidate */
          maxtry--;
          fn = NULL;
        }
      else
        {
          close(fd);
        }
    }

  if (fn==NULL)
    {
      fprintf(stderr, "Could not create a unique temporary file.\n");
      abort();
    }

  /* Explicit checks instead of assert(): they must survive -DNDEBUG. */
  self = malloc (sizeof *self);
  if (self == NULL)
    {
      fprintf(stderr, "Out of memory.\n");
      remove(fn); /* do not leave the reserved file behind */
      abort();
    }

  /* fn points into tmpnam()'s static storage, so keep a private copy */
  fnsize = strlen(fn) + 1;
  self->tmpfile = malloc (fnsize);
  if (self->tmpfile == NULL)
    {
      fprintf(stderr, "Out of memory.\n");
      remove(fn);
      abort();
    }
  memcpy(self->tmpfile, fn, fnsize);

  return self;
}

/* Capacity of the filtered-tag buffer.  It only has to be larger than
 * the longest pattern compared against (44 characters); a tag whose
 * filtered text does not fit cannot match anything. */
enum { PAGESTATE_KEY_MAX = 64 };

/* Filtered forms of the <meta> tags that disable caching, paired with
 * the readable form reported when VERBOSE is defined. */
static const struct {
  const char *key;
  const char *shown;
} pagestate_nocache_tags[] = {
  { "metahttp-equiv=pragmacontent=no-cache",
    "<meta http-equiv=\"pragma\" content=\"no-cache\">" },
  { "metahttp-equiv=cache-controlcontent=no-cache",
    "<meta http-equiv=\"cache-control\" content=\"no-cache\">" },
  { "metahttp-equiv=expirescontent=0",
    "<meta http-equiv=\"expires\" content=\"0\">" },
  { "metahttp-equiv=expirescontent=-1",
    "<meta http-equiv=\"expires\" content=\"-1\">" }
};

/* Tell whether a character of a tag is kept for comparison: ASCII
 * letters, digits, '=', '-' and '/'.  Everything else (blanks, quotes,
 * ...) is dropped. */
static int pagestate_keep_char (char c) {
  return (c == '=') || (c == '-') || (c == '/') ||
         ((c >= '0') && (c <= '9')) ||
         ((c >= 'A') && (c <= 'Z')) ||
         ((c >= 'a') && (c <= 'z'));
}

/* Classify the text found between '<' and '>'.
 * Returns 1 for </head> or <body>, 2 for a no-cache <meta> tag and -1
 * for any other tag.  The comparison is case-insensitive but exact on
 * the filtered text, so e.g. a <body> tag carrying attributes is not
 * recognised. */
static int pagestate_classify_tag (const char *tag, size_t len) {
  char key[PAGESTATE_KEY_MAX];
  size_t used = 0;
  size_t j;

  for (j = 0; j < len; j++)
    {
      if (!pagestate_keep_char (tag[j]))
        {
          continue;
        }
      if (used == sizeof key - 1)
        {
          /* longer than every known pattern: cannot match */
          return -1;
        }
      key[used++] = tag[j];
    }
  key[used] = '\0';

  if ( (strcasecmp ("/head", key) == 0) ||
       (strcasecmp ("body", key) == 0) )
    {
      /* </head> or <body> found...
       * will not find <meta> outside <head> or inside <body> */
#ifdef VERBOSE
      fprintf(stderr, "end of <head> element or begin of "
              "<body> element found\n");
#endif
      return 1;
    }

  for (j = 0;
       j < sizeof pagestate_nocache_tags / sizeof pagestate_nocache_tags[0];
       j++)
    {
      if (strcasecmp (pagestate_nocache_tags[j].key, key) == 0)
        {
          /* found the right tag */
#ifdef VERBOSE
          fprintf(stderr, "%s found\n", pagestate_nocache_tags[j].shown);
#endif
          return 2;
        }
    }

  return -1;
}

/* Scan the first len bytes of an HTML document.
 * Returns 2 when a no-cache <meta> tag shows up before </head> or
 * <body>, 1 otherwise.  Tags inside <!-- ... --> are ignored.  Nothing
 * beyond buf[len-1] is read. */
static int pagestate_scan (const char *buf, size_t len) {
  int in_comment = 0;
  size_t i = 0;

  while (i < len)
    {
      size_t left = len - i;
      size_t end;
      int state;

      if ( (left >= 4) && (memcmp (buf + i, "<!--", 4) == 0) )
        {
          in_comment = 1;
          i += 4; /* resume right after the marker, skipping nothing */
          continue;
        }
      if ( (left >= 3) && (memcmp (buf + i, "-->", 3) == 0) )
        {
          in_comment = 0;
          i += 3;
          continue;
        }
      if ( in_comment || (buf[i] != '<') )
        {
          i++;
          continue;
        }

      /* buf[i] opens a tag: look for its '>' inside the data read */
      end = i + 1;
      while ( (end < len) && (buf[end] != '>') && (buf[end] != '<') )
        {
          end++;
        }
      if ( (end >= len) || (buf[end] != '>') )
        {
          /* found '<' inside a tag, or the tag is cut off by the end of
           * the data: not a complete tag.  The bytes skipped hold no
           * '<' or '>', so no tag or comment marker is missed. */
          i = end;
          continue;
        }

      /* tag has begin and end */
      state = pagestate_classify_tag (buf + i + 1, end - i - 1);
      if (state != -1)
        {
          return state;
        }

      /* continue with the byte right after '>', so that a tag directly
       * following this one is examined too */
      i = end + 1;
    }

  return 1;
}

/*
 * Tell whether the cached copy of a page asks not to be cached.
 *
 * The page is fetched into self->tmpfile through
 * sqclient_retrieve_if_in_cache() and at most the first BUFSIZE bytes
 * of it are scanned.
 *
 * self  object obtained from htmlchk_init()
 * url   address of the page to check
 *
 * Returns
 *    0  the page was not retrieved from the cache
 *   -1  the temporary file could not be opened or read, or memory ran
 *       out (a message is written to stderr)
 *    1  no no-cache <meta> tag found before </head> or <body>
 *    2  a no-cache <meta> tag was found
 */
int htmlchk_pagestate (htmlchk_t * self, char * url) {
  int state;
  char *buf;
  ssize_t size;
  int fd;

  if (sqclient_retrieve_if_in_cache (url, self->tmpfile) != 1)
    {
      /* did not find page in cache */
      return 0;
    }

  /* page is in cache, lets scan it */
  fd = open (self->tmpfile, O_RDONLY);
  if ( fd == -1 )
    {
      fprintf(stderr, "could not open html tmpfile for scanning.\n");
      return -1;
    }

  buf = malloc (BUFSIZE);
  if (buf == NULL)
    {
      fprintf(stderr, "out of memory while scanning html tmpfile.\n");
      close (fd);
      return -1;
    }

  /* the scan is bounded by the byte count, so no padding is needed */
  size = read (fd, buf, BUFSIZE);
  close (fd);

  if (size < 0)
    {
      fprintf(stderr, "could not read html tmpfile for scanning.\n");
      free(buf);
      return -1;
    }

  state = pagestate_scan (buf, (size_t) size);

  free(buf);

  return state;
}


/* Release an htmlchk object: delete its temporary file from disk, then
 * free the stored file name and the object itself.  A NULL object is
 * accepted and ignored. */
void htmlchk_destroy (htmlchk_t * self) {
  char *path;

  if (self == NULL)
    {
      return;
    }

  path = self->tmpfile;
  if (path != NULL)
    {
      /* the file must go before its name is freed */
      remove(path);
      free(path);
    }

  free(self);
}

#ifdef HTMLCHKTEST
/* Query the state of one page, report it on stdout and purge the page
 * from the cache when the state is 2. */
static void test_one_url (struct htmlchk *ht, char *url) {
  int state = htmlchk_pagestate (ht, url);

  printf ("htmlchk %s pagestate=%d\n", url, state);
  if (state != 2)
    {
      return;
    }

  printf("state is 2, purging from cache\n");
  sqclient_purge_from_cache(url);
}

/* Stand-alone test driver: checks two fixed pages, printing a blank
 * line between the reports. */
int main() {
  static const char *const pages[] = {
    "http://www.autsens.com/teste-0.htm",
    "http://www.autsens.com/teste-3.htm"
  };
  char url[1024];
  size_t n;
  struct htmlchk *ht = htmlchk_init();

  printf("htmlchk tmpfilename: '%s'\n", ht->tmpfile);

  for (n = 0; n < sizeof pages / sizeof pages[0]; n++)
    {
      if (n > 0)
        {
          printf("\n");
        }
      /* work on a writable copy: the callees take a non-const string */
      strcpy (url, pages[n]);
      test_one_url (ht, url);
    }

  htmlchk_destroy(ht);
  return 0;
}
#endif
