PostgreSQL 8.3beta1 Documentation | ||||
---|---|---|---|---|
Prev | Fast Backward | Chapter 12. Full Text Search | Fast Forward | Next |
The motivation for this example dictionary is to control the indexing of integers (signed and unsigned) and, consequently, to minimize the number of unique words, which greatly affects search performance.
The dictionary accepts two options:
The MAXLEN parameter specifies the maximum length of the number considered as a 'good' integer. The default value is 6.
The REJECTLONG parameter specifies whether a 'long' integer should be indexed or treated as a stop word. If REJECTLONG=FALSE (default), the dictionary returns the first MAXLEN digits of the integer. If REJECTLONG=TRUE, the dictionary treats a long integer as a stop word.
A similar idea can be applied to the indexing of decimal numbers, for example, in the DecDict dictionary. The dictionary accepts two options: the MAXLENFRAC parameter specifies the maximum length of the fractional part considered as a 'good' decimal. The default value is 3. The REJECTLONG parameter controls whether a decimal number with a 'long' fractional part should be indexed or treated as a stop word. If REJECTLONG=FALSE (default), the dictionary returns the decimal number with its fractional part truncated to MAXLENFRAC digits. If REJECTLONG=TRUE, the dictionary treats the number as a stop word. Notice that REJECTLONG=FALSE allows the indexing of 'shortened' numbers, so search results will contain documents matched through their shortened numbers.
Examples:
SELECT ts_lexize('intdict', '11234567890');
 ts_lexize
-----------
 {112345}
Now, we want to ignore long integers:
ALTER TEXT SEARCH DICTIONARY intdict (
    MAXLEN = 6, REJECTLONG = TRUE
);
SELECT ts_lexize('intdict', '11234567890');
 ts_lexize
-----------
 {}
Create a contrib/dict_intdict directory containing the files dict_tmpl.c, Makefile, and dict_intdict.sql.in, then build and install the dictionary:
$ make && make install
$ psql DBNAME < dict_intdict.sql
This is the dict_tmpl.c file:
/*
 * dict_tmpl.c
 *
 * Example text search dictionary template for integer tokens.
 *
 * dinit_intdict() parses the dictionary options (MAXLEN, REJECTLONG) and
 * dlexize_intdict() either passes an integer token through, truncates it
 * to MAXLEN characters, or rejects it as a stop word.
 */
#include "postgres.h"
#include "utils/builtins.h"
#include "fmgr.h"

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"

/* Per-dictionary settings produced by dinit_intdict(). */
typedef struct
{
	int		maxlen;			/* longest token (in bytes) returned unchanged */
	bool	rejectlong;		/* true: longer tokens are reported as stop words */
} DictInt;

PG_FUNCTION_INFO_V1(dinit_intdict);
Datum		dinit_intdict(PG_FUNCTION_ARGS);

/*
 * Dictionary INIT method.
 *
 * Argument 0 is the option string (may be NULL or absent).  Recognized
 * options, matched case-insensitively:
 *   MAXLEN      integer >= 1, default 6
 *   REJECTLONG  "true" or "false", default false
 *
 * Returns a pointer to a malloc'd DictInt.  Any unknown option, unknown
 * REJECTLONG value, or MAXLEN below 1 is reported with elog(ERROR).
 */
Datum
dinit_intdict(PG_FUNCTION_ARGS)
{
	int			maxlen = 6;			/* defaults */
	bool		rejectlong = false;
	DictInt    *d;

	if (!PG_ARGISNULL(0) && PG_GETARG_POINTER(0) != NULL)
	{
		Map		   *cfg;
		Map		   *pcfg;
		text	   *in = PG_GETARG_TEXT_P(0);

		parse_keyvalpairs(in, &cfg);
		PG_FREE_IF_COPY(in, 0);

		/* The option array is walked until an entry with a NULL key. */
		for (pcfg = cfg; pcfg->key; pcfg++)
		{
			if (strcasecmp("MAXLEN", pcfg->key) == 0)
			{
				maxlen = atoi(pcfg->value);

				/*
				 * dlexize_intdict() writes a terminator at txt[maxlen], so a
				 * zero or negative value must never be accepted.  atoi()
				 * yields 0 for non-numeric input, which is rejected here too.
				 */
				if (maxlen < 1)
					elog(ERROR, "Invalid MAXLEN value: %s (must be >= 1)",
						 pcfg->value);
			}
			else if (strcasecmp("REJECTLONG", pcfg->key) == 0)
			{
				if (strcasecmp("true", pcfg->value) == 0)
					rejectlong = true;
				else if (strcasecmp("false", pcfg->value) == 0)
					rejectlong = false;
				else
					elog(ERROR, "Unknown value: %s => %s",
						 pcfg->key, pcfg->value);
			}
			else
				elog(ERROR, "Unknown option: %s => %s",
					 pcfg->key, pcfg->value);

			pfree(pcfg->key);
			pfree(pcfg->value);
		}

		pfree(cfg);
	}

	/*
	 * Allocate only after every option has been validated: elog(ERROR) does
	 * not return, so a malloc'd struct obtained earlier would be leaked on
	 * each rejected option.
	 */
	d = (DictInt *) malloc(sizeof(DictInt));
	if (!d)
		elog(ERROR, "No memory");
	memset(d, 0, sizeof(DictInt));

	d->maxlen = maxlen;
	d->rejectlong = rejectlong;

	PG_RETURN_POINTER(d);
}

PG_FUNCTION_INFO_V1(dlexize_intdict);
Datum		dlexize_intdict(PG_FUNCTION_ARGS);

/*
 * Dictionary LEXIZE method.
 *
 * Argument 0 is the DictInt built by dinit_intdict(), argument 1 points to
 * the token text and argument 2 is its length in bytes.
 *
 * Returns a palloc'd TSLexeme array terminated by an entry whose lexeme is
 * NULL:
 *   - token no longer than maxlen: the token itself;
 *   - longer token, rejectlong set: an empty array;
 *   - longer token, rejectlong unset: the first maxlen bytes of the token.
 */
Datum
dlexize_intdict(PG_FUNCTION_ARGS)
{
	DictInt    *d = (DictInt *) PG_GETARG_POINTER(0);
	char	   *in = (char *) PG_GETARG_POINTER(1);
	int			len = PG_GETARG_INT32(2);
	char	   *txt = pnstrdup(in, len);
	TSLexeme   *res = palloc(sizeof(TSLexeme) * 2);

	/*
	 * Zero both entries so that res[1].lexeme is the NULL terminator and any
	 * TSLexeme fields other than lexeme do not carry uninitialized memory.
	 */
	memset(res, 0, sizeof(TSLexeme) * 2);

	if (len <= d->maxlen)
		res[0].lexeme = txt;
	else if (d->rejectlong)
	{
		/* stop word: return an empty array */
		pfree(txt);
	}
	else
	{
		/* cut integer; maxlen >= 1 and < len, so the index is in bounds */
		txt[d->maxlen] = '\0';
		res[0].lexeme = txt;
	}

	PG_RETURN_POINTER(res);
}
This is the Makefile:
# Makefile for the dict_intdict example module.
# Each assignment and include must be on its own line; make treats the
# newline as the statement terminator.

subdir = contrib/dict_intdict
top_builddir = ../..
include $(top_builddir)/src/Makefile.global

# Shared library name and the object file it is built from.
MODULE_big = dict_intdict
OBJS = dict_tmpl.o

# Install script; presumably generated from dict_intdict.sql.in by the
# contrib build rules.
DATA_built = dict_intdict.sql
DOCS =

include $(top_srcdir)/contrib/contrib-global.mk
This is the dict_intdict.sql.in file:
-- Installation script for the intdict example dictionary.
-- Registers the two C entry points, a template built on them, and a
-- dictionary that uses the template with its default options.

SET default_text_search_config = 'english';

BEGIN;

-- INIT method: receives the dictionary options.
CREATE OR REPLACE FUNCTION dinit_intdict(internal)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C';

-- LEXIZE method: declared strict, so it is not called with NULL arguments.
CREATE OR REPLACE FUNCTION dlexize_intdict(internal, internal, internal, internal)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C'
    WITH (isstrict);

CREATE TEXT SEARCH TEMPLATE intdict_template (
    LEXIZE = dlexize_intdict,
    INIT   = dinit_intdict
);

CREATE TEXT SEARCH DICTIONARY intdict (
    TEMPLATE   = intdict_template,
    MAXLEN     = 6,
    REJECTLONG = false
);

COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'Dictionary for Integers';

COMMIT;