Artifact c92ed57e57fac105959061e627aa386fdaf455fd
File
src/tokenize_path.c
part of check-in
[7f9226a858]
- added tokenize_path.c, containing a convenient path-like-string tokenizer and render_linked_path() to HTML-ize a path.
by
stephan on
2008-02-06 19:02:35.
#include <string.h>
#include <stdlib.h>
/**
tokenize_path_free() is the only publically-defined way to deallocate
a string array created by tokenize_path(). It must be called exactly
once for each return value from tokenize_path(). Failing to call it
will result in a memory leak.
If (!p) then this function does nothing. Passing a pointer which was
not returned from tokenize_path() will result in undefined behaviour.
After calling this, p's contents are invalid.
*/
void tokenize_path_free( char ** p )
{
if( p )
{
/* Free the tokenized strings (a single string, actually): */
free( *(p-1) );
/* Free p from its REAL starting point. */
free( p-1 );
}
}
/**
tokenize_path_is_separator() is the default predicate function for
tokenize_path(). It returns 1 if (c == '/'), else it returns 0.
*/
int tokenize_path_is_separator( int c )
{
return (c == '/');
}
/**
tokenize_path() takes a string, assumed to be a delimited
null-terminated path-style string (like a path to a file), and
tokenizes it into its component parts. The 'out' parameter (if not
null) is set to the number of tokenized items (may be 0).
The third argument is a unary predicate function which takes
a single character and returns true only if that character
is a "separator character". If the 3rd argument is 0 then
tokenize_path_is_separator() is used.
The function returns a list of strings (or 0) which must be freed
via tokenize_path_free() because the internal allocation of the
return result is a bit tricky (to minimize on allocations). DO NOT
pass the return result to free(), as that will cause undefined
behaviour. Because the returned array is null-terminated, the second
parameter is normally not needed because the array can safely
be looped over without knowing its length in advance. Nonetheless,
having the count before looping may be useful for some cases.
However,
the returned array
The returned string array is always null-terminated, to simplify
looping over it. The function returns null if the input string is
null, empty, or contains only separator characters.
Tokenizing behaviour:
- It assumes that ALL non-separator chars are entry names.
- It treats runs of multiple separators chars as a single
separator, NOT as a series of empty tokens.
- It has no knowledge of relative or absolute paths, so
"." and ".." are considered to be normal entries.
- The returned strings are non-const, but the caller must not
change their sizes or reallocate them at different memory
addresses. The only legal way to deallocate them is with
tokenize_path_free(). Changing the string content IS is legal.
e.g.:
"/path/to/nowhere" and "path/to///nowhere/" both parse to:
Parses to: { "path", "to", "nowhere" }
"/./../"
Parses to: { ".", ".." }
"http://foo.com/bar"
Parses to: { "http:", "foo.com", "bar" }
(Note that those arrays all have an implicit NULL entry as their
last element. )
*/
char ** tokenize_path( char const * in,
int * out,
int (*predicate)( int )
)
{ /* Author: sgbeal@googlemail.com. License: Public Domain. */
int ignored;
if( ! out ) out = &ignored;
*out = 0;
typedef int (*sep_f)( int );
sep_f is_sep = (predicate ? predicate : tokenize_path_is_separator);
int inlen = strlen(in);
if( (! in) || (0==inlen) ) return 0;
char * cp = malloc( inlen + 1 );
/**
We make a copy because:
Our algorithm is to replace separators with 0 in our copy, and
use that copy as our return value. This allows us to avoid
allocating a new string for each returned result.
*/
strcpy( cp, in );
/**
buffsize = the largest possible number of return result we can
have, plus 1 (to allow for truncated division). The maximum size
is determined based on worst-case scenario: a list of single
characters, each separated by one separators, e.g. "/1/1/1/1/1"
*/
const int buffsize = inlen / 2 + 1;
/* 'starts' stores the starting point of each path component
substring of 'cp'. When we slice up 'cp' below, starts[x]
will be set to point to a particular position within 'cp'.
That allows us to avoid allocating/copying each element
separately.
*/
char * starts[buffsize];
int i = 0;
for( i = 0; i < buffsize; ++i ) starts[i] = 0;
char * curs = cp;
for( curs = cp; is_sep(*curs); ++curs );
/* ^^^ We skip leading separators so we can easily
mark where the first entry string actually begins.
*/
if( '\0' == curs )
{
free( cp );
return 0;
}
char * mark = curs; /* placeholder for holding the head addr of strings. */
int count = 0; /* total number of elements we end up tokenizing. */
int started = 0; /* toggled when we enter a new path element. */
for( ; *curs != '\0'; ++curs )
{
/** Replace '/' with '\0'... */
if( is_sep(*curs) )
{
*curs = '\0';
mark = curs+1;
started = 0;
continue;
}
if( ! started )
{ /** Start a new path element... */
starts[count] = mark;
started = 1;
++count;
}
}
if( ! starts[0] )
{
free( cp );
return 0;
}
cp[inlen] = '\0';
char ** ret = calloc( count + 2, sizeof(char*) );
/* We over-allocate by 2 entries. The first one holds the address of
'cp' and the last one is set to 0 to simplify looping over the
array. */
*out = count;
ret[0] = cp;
++ret;
/**
We're going to hide that [0] entry from the caller. Instead, we
use that to hold the address of 'cp'. In tokenize_path_free()
we release both that string and (ret-1).
*/
for( i = 0; i < count; ++i )
{
ret[i] = starts[i];
}
ret[count] = 0;
return ret;
}
extern void cgi_printf(const char *zFormat,...);
/**
render_linked_path() takes a root path and a /unix/style/path and
renders (using cgi_printf()) a clickable list of the entries in the
path. If path is null it does nothing. If root is null it is treated
as an empty string.
Example:
render_linked_path( "/AAA", "b/c/d" );
It would render a list similar to the following,
but think of the text in [brackets] as hyperlinked:
[b]/[c]/[d]
Each element is linked to a path like so:
b: root/b
c: root/b/c
d: root/b/c/d
If root is null then the 'root/' part is not applied.
*/
void render_linked_path( char const * root,
char const * path )
{
int count = 0;
char ** toks = tokenize_path( path, &count, 0 );
if( ! toks ) return;
char const * t = 0;
int pos = 0;
for( t = toks[pos]; t; t = toks[++pos] )
{
cgi_printf( "<a href='" );
if( root )
{
cgi_printf( "%s/", root );
}
int bpos = 0;
for( ; bpos < pos; ++bpos )
{
cgi_printf("%s/", toks[bpos] );
}
cgi_printf("%s'>%s</a>", t, t );
if( pos != (count-1) )
{
cgi_printf("/");
}
}
tokenize_path_free( toks );
}
#if 0 /* set to 1 to compile a test app. */
#include <stdio.h>
static int sep_char = '?';
static int is_sep_char( int c )
{
return c == sep_char;
}
int main( int argc, char ** argv )
{
int count = 0;
sep_char = ( (argc>2) ? (argv[2])[0] : '/');
printf( "sep_char==%c\n",sep_char);
char ** l = tokenize_path( argc==1 ? 0 : argv[1],
&count,
is_sep_char );
printf( "parsed path: count=%d\n", count );
if( ! count )
{
printf("error: path didn't parse :(\n");
return 1;
}
char * x;
int i = 0;
for( x = l[0]; x; x = l[++i] )
{
printf( "\t%s\n", x );
}
tokenize_path_free( l );
printf( "Bye!\n");
return 0;
}
#endif