Code Search for Developers
 
 
  

pattern.c from Gtk-Gnutella at Krugle


Show pattern.c syntax highlighted

/*
 * $Id: pattern.c 11811 2006-08-29 21:24:49Z rmanfredi $
 *
 * Copyright (c) 2001-2004, Raphael Manfredi
 *
 *----------------------------------------------------------------------
 * This file is part of gtk-gnutella.
 *
 *  gtk-gnutella is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  gtk-gnutella is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with gtk-gnutella; if not, write to the Free Software
 *  Foundation, Inc.:
 *      59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *----------------------------------------------------------------------
 */

/**
 * @ingroup lib
 * @file
 *
 * Pattern matching.
 *
 * @author Raphael Manfredi
 * @date 2001-2004
 */

#include "common.h"

RCSID("$Id: pattern.c 11811 2006-08-29 21:24:49Z rmanfredi $")

#include "misc.h"
#include "pattern.h"
#include "zalloc.h"
#include "override.h"		/* Must be the last header included */

static zone_t *pat_zone = NULL;		/**< Compiled patterns */

/**
 * Initialize pattern data structures.
 */
void
pattern_init(void)
{
	/*
	 * Patterns are not only used for query matching but also for filters,
	 * therefore we can expect quite a few to be created at the same time.
	 */

	pat_zone = zget(sizeof(cpattern_t), 64);
}

/**
 * Cleanup data structures.
 */
void
pattern_close(void)
{
	zdestroy(pat_zone);
}

/*
 * Pattern matching (substrings, not regular expressions)
 *
 * The algorithm used below is the one described in Communications
 * of the ACM, volume 33, number 8, August 1990, by Daniel M. Sunday
 * It's a variant of the classical Boyer-Moore search, but with a small
 * enhancement that can make a difference.
 */

/**
 * Compile given string pattern by computing the delta shift table.
 * The pattern string given is duplicated.
 *
 * @return a compiled pattern structure.
 */
cpattern_t *
pattern_compile(const gchar *pattern)
{
	cpattern_t *p = zalloc(pat_zone);
	size_t plen, i, *pd = p->delta;
	const guchar *c;

	p->pattern = g_strdup(pattern);
	p->len = plen = strlen(p->pattern);
	p->duped = TRUE;

	plen++;			/* Avoid increasing within the loop */

	for (i = 0; i < ALPHA_SIZE; i++)
		*pd++ = plen;

	plen--;			/* Restore original pattern length */

 	c = cast_to_gconstpointer(pattern);
	for (pd = p->delta, i = 0; i < plen; c++, i++)
		pd[*c] = plen - i;

	return p;
}

/**
 * Same as pattern_compile(), but the pattern string is NOT duplicated,
 * and its length is known upon entry.
 *
 * @attention
 * NB: There is no pattern_free_fast(), just call zfree() on the result.
 */
cpattern_t *
pattern_compile_fast(const gchar *pattern, size_t plen)
{
	cpattern_t *p = zalloc(pat_zone);
	size_t i, *pd = p->delta;
	const guchar *c;

	p->pattern = pattern;
	p->len = plen;
	p->duped = FALSE;

	plen++;			/* Avoid increasing within the memset() inlined macro */

	for (i = 0; i < ALPHA_SIZE; i++)
		*pd++ = plen;

	plen--;			/* Restore original pattern length */

 	c = cast_to_gconstpointer(pattern);
	for (pd = p->delta, i = 0; i < plen; c++, i++)
		pd[*c] = plen - i;

	return p;
}

/**
 * Dispose of compiled pattern.
 */
void
pattern_free(cpattern_t *cpat)
{
	if (cpat->duped) {
		g_free(deconstify_gchar(cpat->pattern));
		cpat->pattern = NULL;
	}
	zfree(pat_zone, cpat);
}

/**
 * Quick substring search algorithm.  It looks for the compiled pattern
 * with `text', from left to right.  The `tlen' argument is the length
 * of the text, and can left to 0, in which case it will be computed.
 *
 * @return pointer to beginning of matching substring, NULL if not found.
 */
const gchar *
pattern_qsearch(
	cpattern_t *cpat,		/**< Compiled pattern */
	const gchar *text,		/**< Text we're scanning */
	size_t tlen,			/**< Text length, 0 = compute strlen(text) */
	size_t toffset,			/**< Offset within text for search start */
	qsearch_mode_t word)	/**< Beginning/whole word matching? */
{
	const gchar *p;			/* Pointer within string pattern */
	const gchar *t;			/* Pointer within text */
	const gchar *tp;		/* Initial local search text pointer */
	const gchar *start;		/* Start of matching */
	const gchar *end;		/* End of text (first byte after physical end) */
	size_t i;				/* Position within pattern string */
	size_t plen;

	if (!tlen)
		tlen = strlen(text);
	start = text + toffset;
	end = text + tlen;
	tp = start;
	plen = cpat->len;

	while (tp + plen <= end) {		/* Enough text left for matching */

		for (p = cpat->pattern, t = tp, i = 0; i < plen; p++, t++, i++)
			if (*p != *t)
				break;				/* Mismatch, stop looking here */

		if (i == plen) {			/* OK, we got a pattern match */
			gboolean at_begin = FALSE;

			if (word == qs_any)
				return tp;			/* Start of substring */

			/*
			 * They set `word', so we must look whether we are at the start
			 * of a word, i.e. if it is either the beginning of the text,
			 * or if the character before is a non-alphanumeric character.
			 */

			g_assert(word == qs_begin || word == qs_whole);

			if (tp == text) {					/* At beginning of text */
				if (word == qs_begin) return tp;
				else at_begin = TRUE;
			} else if (0x20 == *(tp-1)) {	/* At word boundary */
				if (word == qs_begin) return tp;
				else at_begin = TRUE;
			}

			if (at_begin && word == qs_whole) {
				if (&tp[plen] == end)			/* At end of string */
					return tp;
				else if (0x20 == tp[plen])
					return tp; /* At word boundary after */
			}

			/* Fall through */
		}

		tp += cpat->delta[(guchar) tp[plen]]; /* Continue search there */
	}

	return NULL;		/* Not found */
}

/* vi: set ts=4 sw=4 cindent: */




See more files for this project here

Gtk-Gnutella

A GTK+ Gnutella client for Unix, efficient, reliable and fast, written in C. It has been optimized for speed and scalability, with low-memory consumption. It is meant to be left running 24x7, using little CPU and only the configured bandwidth.

Project homepage: http://sourceforge.net/projects/gtk-gnutella
Programming language(s): C
License: other

  Jmakefile
  Makefile.SH
  adns.c
  adns.h
  aging.c
  aging.h
  array.h
  atoms.c
  atoms.h
  base16.c
  base16.h
  base32.c
  base32.h
  base64.c
  base64.h
  bg.c
  bg.h
  bit_array.h
  cobs.c
  cobs.h
  cq.c
  cq.h
  crash.c
  crash.h
  crc.c
  crc.h
  dbus_util.c
  dbus_util.h
  endian.h
  eval.c
  eval.h
  event.c
  event.h
  fast_assert.c
  fast_assert.h
  fifo.c
  fifo.h
  file.c
  file.h
  fragcheck.c
  fragcheck.h
  getdate.c
  getdate.h
  getdate.y
  getline.c
  getline.h
  getphysmemsize.c
  getphysmemsize.h
  glib-missing.c
  glib-missing.h
  halloc.c
  halloc.h
  hashlist.c
  hashlist.h
  hashtable.c
  hashtable.h
  header.c
  header.h
  host_addr.c
  host_addr.h
  html.c
  html.h
  html_entities.h
  idtable.c
  idtable.h
  inputevt.c
  inputevt.h
  iovec.h
  iprange.c
  iprange.h
  iso3166.c
  iso3166.h
  list.c
  list.h
  listener.h
  magnet.c
  magnet.h
  malloc.c
  malloc.h
  misc.c
  misc.h
  options.c
  options.h
  override.h
  pagetable.c
  pagetable.h
  palloc.c
  palloc.h
  pattern.c
  pattern.h
  prop.c
  prop.h
  sbool.h
  sha1.c
  sha1.h
  slist.c
  slist.h
  socket.c
  socket.h
  sorted_array.c
  sorted_array.h
  stats.c