mirror of
https://github.com/ultimatepp/ultimatepp.git
synced 2026-05-16 22:02:58 -06:00
1859 lines
55 KiB
C
1859 lines
55 KiB
C
/*====================================================================*
|
|
- Copyright (C) 2001 Leptonica. All rights reserved.
|
|
- This software is distributed in the hope that it will be
|
|
- useful, but with NO WARRANTY OF ANY KIND.
|
|
- No author or distributor accepts responsibility to anyone for the
|
|
- consequences of using this software, or for whether it serves any
|
|
- particular purpose or works at all, unless he or she says so in
|
|
- writing. Everyone is granted permission to copy, modify and
|
|
- redistribute this source code, for commercial or non-commercial
|
|
- purposes, with the following restrictions: (1) the origin of this
|
|
- source code must not be misrepresented; (2) modified versions must
|
|
- be plainly marked as such; and (3) this notice may not be removed
|
|
- or altered from any source or modified source distribution.
|
|
*====================================================================*/
|
|
|
|
|
|
/*
|
|
* sarray.c
|
|
*
|
|
* Create/Destroy/Copy
|
|
* SARRAY *sarrayCreate()
|
|
* SARRAY *sarrayCreateInitialized()
|
|
* SARRAY *sarrayCreateWordsFromString()
|
|
* SARRAY *sarrayCreateLinesFromString()
|
|
* void *sarrayDestroy()
|
|
* SARRAY *sarrayCopy()
|
|
* SARRAY *sarrayClone()
|
|
*
|
|
* Add/Remove string
|
|
* l_int32 sarrayAddString()
|
|
* l_int32 sarrayExtendArray()
|
|
* char *sarrayRemoveString()
|
|
* l_int32 sarrayReplaceString()
|
|
* l_int32 sarrayClear()
|
|
*
|
|
* Accessors
|
|
* l_int32 sarrayGetCount()
|
|
* char **sarrayGetArray()
|
|
* char *sarrayGetString()
|
|
* l_int32 sarrayGetRefcount()
|
|
* l_int32 sarrayChangeRefcount()
|
|
*
|
|
* Conversion back to string
|
|
* char *sarrayToString()
|
|
* char *sarrayToStringRange()
|
|
*
|
|
* Concatenate 2 sarrays
|
|
* l_int32 sarrayConcatenate()
|
|
* l_int32 sarrayAppendRange()
|
|
*
|
|
* Pad an sarray to be the same size as another sarray
|
|
* l_int32 sarrayPadToSameSize()
|
|
*
|
|
* Convert word sarray to (formatted) line sarray
|
|
* SARRAY *sarrayConvertWordsToLines()
|
|
*
|
|
* Split string on separator list
|
|
* SARRAY *sarraySplitString()
|
|
*
|
|
* Filter sarray
|
|
* SARRAY *sarraySelectBySubstring()
|
|
* SARRAY *sarraySelectByRange()
|
|
* l_int32 sarrayParseRange()
|
|
*
|
|
* Sort
|
|
* SARRAY *sarraySort()
|
|
* l_int32 stringCompareLexical()
|
|
*
|
|
* Serialize for I/O
|
|
* SARRAY *sarrayRead()
|
|
* SARRAY *sarrayReadStream()
|
|
* l_int32 sarrayWrite()
|
|
* l_int32 sarrayWriteStream()
|
|
* l_int32 sarrayAppend()
|
|
*
|
|
* Directory filenames
|
|
* SARRAY *getNumberedPathnamesInDirectory()
|
|
* SARRAY *getSortedPathnamesInDirectory()
|
|
* SARRAY *getFilenamesInDirectory()
|
|
*
|
|
* Comments on usage:
|
|
*
|
|
* These functions are important for efficient manipulation
|
|
* of string data. They have been used in leptonica for
|
|
* generating and parsing text files, and for generating
|
|
* code for compilation. The user is responsible for
|
|
* correctly disposing of strings that have been extracted
|
|
* from sarrays.
|
|
*
|
|
* - When you want a string from an Sarray to inspect it, or
|
|
* plan to make a copy of it later, use sarrayGetString()
|
|
* with copyflag = 0. In this case, you must neither free
|
|
* the string nor put it directly in another array.
|
|
* We provide the copyflag constant L_NOCOPY, which is 0,
|
|
* for this purpose:
|
|
* str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
|
|
* To extract a copy of a string, use:
|
|
* str-owned = sarrayGetString(sa, index, L_COPY);
|
|
*
|
|
* - When you want to insert a string that is in one
|
|
* array into another array (always leaving the first
|
|
* array intact), you have two options:
|
|
* (1) use copyflag = L_COPY to make an immediate copy,
|
|
* which you must then add to the second array
|
|
* by insertion; namely,
|
|
* str-owned = sarrayGetString(sa, index, L_COPY);
|
|
* sarrayAddString(sa, str-owned, L_INSERT);
|
|
* (2) use copyflag = L_NOCOPY to get another handle to
|
|
* the string, in which case you must add
|
|
* a copy of it to the second string array:
|
|
* str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
|
|
* sarrayAddString(sa, str-not-owned, L_COPY).
|
|
*
|
|
* In all cases, when you use copyflag = L_COPY to extract
|
|
* a string from an array, you must either free it
|
|
* or insert it in an array that will be freed later.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#ifndef COMPILER_MSVC
|
|
#include <dirent.h> /* unix only */
|
|
#endif /* !COMPILER_MSVC */
|
|
#include "allheaders.h"
|
|
|
|
static const l_int32 INITIAL_PTR_ARRAYSIZE = 50; /* arbitrary default size of the string ptr array; used by sarrayCreate() when n <= 0 */
|
|
static const l_int32 L_BUF_SIZE = 512; /* NOTE(review): not referenced in the visible part of this file; presumably a text buffer size -- confirm */
|
|
|
|
|
|
/*--------------------------------------------------------------------------*
|
|
* String array create/destroy/copy/extend *
|
|
*--------------------------------------------------------------------------*/
|
|
/*!
|
|
* sarrayCreate()
|
|
*
|
|
* Input: size of string ptr array to be alloc'd
|
|
* (use 0 for default)
|
|
* Return: sarray, or null on error
|
|
*/
|
|
SARRAY *
|
|
sarrayCreate(l_int32 n)
|
|
{
|
|
SARRAY *sa;
|
|
|
|
PROCNAME("sarrayCreate");
|
|
|
|
if (n <= 0)
|
|
n = INITIAL_PTR_ARRAYSIZE;
|
|
|
|
if ((sa = (SARRAY *)CALLOC(1, sizeof(SARRAY))) == NULL)
|
|
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
|
|
if ((sa->array = (char **)CALLOC(n, sizeof(char *))) == NULL)
|
|
return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL);
|
|
|
|
sa->nalloc = n;
|
|
sa->n = 0;
|
|
sa->refcount = 1;
|
|
return sa;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayCreateInitialized()
|
|
*
|
|
* Input: n (size of string ptr array to be alloc'd)
|
|
* initstr (string to be initialized on the full array)
|
|
* Return: sarray, or null on error
|
|
*/
|
|
SARRAY *
|
|
sarrayCreateInitialized(l_int32 n,
|
|
char *initstr)
|
|
{
|
|
l_int32 i;
|
|
SARRAY *sa;
|
|
|
|
PROCNAME("sarrayCreateInitialized");
|
|
|
|
if (n <= 0)
|
|
return (SARRAY *)ERROR_PTR("n must be > 0", procName, NULL);
|
|
if (!initstr)
|
|
return (SARRAY *)ERROR_PTR("initstr not defined", procName, NULL);
|
|
|
|
sa = sarrayCreate(n);
|
|
for (i = 0; i < n; i++)
|
|
sarrayAddString(sa, initstr, L_COPY);
|
|
return sa;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayCreateWordsFromString()
|
|
*
|
|
* Input: string
|
|
* Return: sarray, or null on error
|
|
*
|
|
* Notes:
|
|
* (1) This finds the number of word substrings, creates an sarray
|
|
* of this size, and puts copies of each substring into the sarray.
|
|
*/
|
|
SARRAY *
|
|
sarrayCreateWordsFromString(const char *string)
|
|
{
|
|
char separators[] = " \n\t";
|
|
l_int32 i, nsub, size, inword;
|
|
SARRAY *sa;
|
|
|
|
PROCNAME("sarrayCreateWordsFromString");
|
|
|
|
if (!string)
|
|
return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
|
|
|
|
/* Find the number of words */
|
|
size = strlen(string);
|
|
nsub = 0;
|
|
inword = FALSE;
|
|
for (i = 0; i < size; i++) {
|
|
if (inword == FALSE &&
|
|
(string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) {
|
|
inword = TRUE;
|
|
nsub++;
|
|
}
|
|
else if (inword == TRUE &&
|
|
(string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) {
|
|
inword = FALSE;
|
|
}
|
|
}
|
|
|
|
if ((sa = sarrayCreate(nsub)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
|
|
sarraySplitString(sa, string, separators);
|
|
|
|
return sa;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayCreateLinesFromString()
|
|
*
|
|
* Input: string
|
|
* blankflag (0 to exclude blank lines; 1 to include)
|
|
* Return: sarray, or null on error
|
|
*
|
|
* Notes:
|
|
* (1) This finds the number of line substrings, creates an sarray of
|
|
* this size, and puts copies of each substring into the sarray.
|
|
*/
|
|
SARRAY *
|
|
sarrayCreateLinesFromString(char *string,
|
|
l_int32 blankflag)
|
|
{
|
|
l_int32 i, nsub, size, startptr;
|
|
char *cstring, *substring;
|
|
SARRAY *sa;
|
|
|
|
PROCNAME("sarrayCreateLinesFromString");
|
|
|
|
if (!string)
|
|
return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
|
|
|
|
/* find the number of lines */
|
|
size = strlen(string);
|
|
nsub = 0;
|
|
for (i = 0; i < size; i++) {
|
|
if (string[i] == '\n')
|
|
nsub++;
|
|
}
|
|
|
|
if ((sa = sarrayCreate(nsub)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
|
|
|
|
if (blankflag) { /* keep blank lines as null strings */
|
|
/* Make a copy for munging */
|
|
if ((cstring = stringNew(string)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL);
|
|
/* We'll insert nulls like strtok */
|
|
startptr = 0;
|
|
for (i = 0; i < size; i++) {
|
|
if (cstring[i] == '\n') {
|
|
cstring[i] = '\0';
|
|
if ((substring = stringNew(cstring + startptr)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("substring not made",
|
|
procName, NULL);
|
|
sarrayAddString(sa, substring, L_INSERT);
|
|
/* fprintf(stderr, "substring = %s\n", substring); */
|
|
startptr = i + 1;
|
|
}
|
|
}
|
|
if (startptr < size) { /* no newline at end of last line */
|
|
if ((substring = stringNew(cstring + startptr)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("substring not made",
|
|
procName, NULL);
|
|
sarrayAddString(sa, substring, L_INSERT);
|
|
/* fprintf(stderr, "substring = %s\n", substring); */
|
|
}
|
|
FREE(cstring);
|
|
}
|
|
else { /* remove blank lines; use strtok */
|
|
sarraySplitString(sa, string, "\n");
|
|
}
|
|
|
|
return sa;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayDestroy()
|
|
*
|
|
* Input: &sarray <to be nulled>
|
|
* Return: void
|
|
*
|
|
* Notes:
|
|
* (1) Decrements the ref count and, if 0, destroys the sarray.
|
|
* (2) Always nulls the input ptr.
|
|
*/
|
|
void
|
|
sarrayDestroy(SARRAY **psa)
|
|
{
|
|
l_int32 i;
|
|
SARRAY *sa;
|
|
|
|
PROCNAME("sarrayDestroy");
|
|
|
|
if (psa == NULL) {
|
|
L_WARNING("ptr address is NULL!", procName);
|
|
return;
|
|
}
|
|
if ((sa = *psa) == NULL)
|
|
return;
|
|
|
|
sarrayChangeRefcount(sa, -1);
|
|
if (sarrayGetRefcount(sa) <= 0) {
|
|
if (sa->array) {
|
|
for (i = 0; i < sa->n; i++)
|
|
FREE(sa->array[i]);
|
|
FREE(sa->array);
|
|
}
|
|
FREE(sa);
|
|
}
|
|
|
|
*psa = NULL;
|
|
return;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayCopy()
|
|
*
|
|
* Input: sarray
|
|
* Return: copy of sarray, or null on error
|
|
*/
|
|
SARRAY *
|
|
sarrayCopy(SARRAY *sa)
|
|
{
|
|
l_int32 i;
|
|
SARRAY *csa;
|
|
|
|
PROCNAME("sarrayCopy");
|
|
|
|
if (!sa)
|
|
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
|
|
|
|
if ((csa = sarrayCreate(sa->nalloc)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("csa not made", procName, NULL);
|
|
|
|
for (i = 0; i < sa->n; i++)
|
|
sarrayAddString(csa, sa->array[i], L_COPY);
|
|
|
|
return csa;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayClone()
|
|
*
|
|
* Input: sarray
|
|
* Return: ptr to same sarray, or null on error
|
|
*/
|
|
SARRAY *
|
|
sarrayClone(SARRAY *sa)
|
|
{
|
|
PROCNAME("sarrayClone");
|
|
|
|
if (!sa)
|
|
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
|
|
sarrayChangeRefcount(sa, 1);
|
|
return sa;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayAddString()
|
|
*
|
|
* Input: sarray
|
|
* string (string to be added)
|
|
* copyflag (L_INSERT, L_COPY)
|
|
* Return: 0 if OK, 1 on error
|
|
*
|
|
* Notes:
|
|
* (1) Legacy usage decrees that we always use 0 to insert a string
|
|
* directly and 1 to insert a copy of the string. The
|
|
* enums for L_INSERT and L_COPY agree with this convention,
|
|
* and will not change in the future.
|
|
* (2) See usage comments at the top of this file.
|
|
*/
|
|
l_int32
|
|
sarrayAddString(SARRAY *sa,
|
|
char *string,
|
|
l_int32 copyflag)
|
|
{
|
|
l_int32 n;
|
|
|
|
PROCNAME("sarrayAddString");
|
|
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 1);
|
|
if (!string)
|
|
return ERROR_INT("string not defined", procName, 1);
|
|
if (copyflag != L_INSERT && copyflag != L_COPY)
|
|
return ERROR_INT("invalid copyflag", procName, 1);
|
|
|
|
n = sarrayGetCount(sa);
|
|
if (n >= sa->nalloc)
|
|
sarrayExtendArray(sa);
|
|
|
|
if (copyflag == L_INSERT)
|
|
sa->array[n] = string;
|
|
else /* L_COPY */
|
|
sa->array[n] = stringNew(string);
|
|
sa->n++;
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayExtendArray()
|
|
*
|
|
* Input: sarray
|
|
* Return: 0 if OK, 1 on error
|
|
*/
|
|
l_int32
|
|
sarrayExtendArray(SARRAY *sa)
|
|
{
|
|
PROCNAME("sarrayExtendArray");
|
|
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 1);
|
|
|
|
if ((sa->array = (char **)reallocNew((void **)&sa->array,
|
|
sizeof(char *) * sa->nalloc,
|
|
2 * sizeof(char *) * sa->nalloc)) == NULL)
|
|
return ERROR_INT("new ptr array not returned", procName, 1);
|
|
|
|
sa->nalloc *= 2;
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayRemoveString()
|
|
*
|
|
* Input: sarray
|
|
* index (of string within sarray)
|
|
* Return: removed string, or null on error
|
|
*/
|
|
char *
|
|
sarrayRemoveString(SARRAY *sa,
|
|
l_int32 index)
|
|
{
|
|
char *string;
|
|
char **array;
|
|
l_int32 i, n, nalloc;
|
|
|
|
PROCNAME("sarrayRemoveString");
|
|
|
|
if (!sa)
|
|
return (char *)ERROR_PTR("sa not defined", procName, NULL);
|
|
|
|
if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL)
|
|
return (char *)ERROR_PTR("array not returned", procName, NULL);
|
|
|
|
if (index < 0 || index >= n)
|
|
return (char *)ERROR_PTR("array index out of bounds", procName, NULL);
|
|
|
|
string = array[index];
|
|
|
|
/* If removed string is not at end of array, shift
|
|
* to fill in, maintaining original ordering.
|
|
* Note: if we didn't care about the order, we could
|
|
* put the last string array[n - 1] directly into the hole. */
|
|
for (i = index; i < n - 1; i++)
|
|
array[i] = array[i + 1];
|
|
|
|
sa->n--;
|
|
return string;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayReplaceString()
|
|
*
|
|
* Input: sarray
|
|
* index (of string within sarray to be replaced)
|
|
* newstr (string to replace existing one)
|
|
* copyflag (L_INSERT, L_COPY)
|
|
* Return: 0 if OK, 1 on error
|
|
*
|
|
* Notes:
|
|
* (1) This destroys an existing string and replaces it with
|
|
* the new string or a copy of it.
|
|
* (2) By design, an sarray is always compacted, so there are
|
|
* never any holes (null ptrs) in the ptr array up to the
|
|
* current count.
|
|
*/
|
|
l_int32
|
|
sarrayReplaceString(SARRAY *sa,
|
|
l_int32 index,
|
|
char *newstr,
|
|
l_int32 copyflag)
|
|
{
|
|
char *str;
|
|
l_int32 n;
|
|
|
|
PROCNAME("sarrayReplaceString");
|
|
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 1);
|
|
n = sarrayGetCount(sa);
|
|
if (index < 0 || index >= n)
|
|
return ERROR_INT("array index out of bounds", procName, 1);
|
|
if (!newstr)
|
|
return ERROR_INT("newstr not defined", procName, 1);
|
|
if (copyflag != L_INSERT && copyflag != L_COPY)
|
|
return ERROR_INT("invalid copyflag", procName, 1);
|
|
|
|
FREE(sa->array[index]);
|
|
if (copyflag == L_INSERT)
|
|
str = newstr;
|
|
else /* L_COPY */
|
|
str = stringNew(newstr);
|
|
sa->array[index] = str;
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayClear()
|
|
*
|
|
* Input: sarray
|
|
* Return: 0 if OK; 1 on error
|
|
*/
|
|
l_int32
|
|
sarrayClear(SARRAY *sa)
|
|
{
|
|
l_int32 i;
|
|
|
|
PROCNAME("sarrayClear");
|
|
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 1);
|
|
for (i = 0; i < sa->n; i++) { /* free strings and null ptrs */
|
|
FREE(sa->array[i]);
|
|
sa->array[i] = NULL;
|
|
}
|
|
sa->n = 0;
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------*
|
|
* Accessors *
|
|
*----------------------------------------------------------------------*/
|
|
/*!
|
|
* sarrayGetCount()
|
|
*
|
|
* Input: sarray
|
|
* Return: count, or 0 if no strings or on error
|
|
*/
|
|
l_int32
|
|
sarrayGetCount(SARRAY *sa)
|
|
{
|
|
PROCNAME("sarrayGetCount");
|
|
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 0);
|
|
return sa->n;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayGetArray()
|
|
*
|
|
* Input: sarray
|
|
* &nalloc (<optional return> number allocated string ptrs)
|
|
* &n (<optional return> number allocated strings)
|
|
* Return: ptr to string array, or null on error
|
|
*
|
|
* Notes:
|
|
* (1) Caution: the returned array is not a copy, so caller
|
|
* must not destroy it!
|
|
*/
|
|
char **
|
|
sarrayGetArray(SARRAY *sa,
|
|
l_int32 *pnalloc,
|
|
l_int32 *pn)
|
|
{
|
|
char **array;
|
|
|
|
PROCNAME("sarrayGetArray");
|
|
|
|
if (!sa)
|
|
return (char **)ERROR_PTR("sa not defined", procName, NULL);
|
|
|
|
array = sa->array;
|
|
if (pnalloc) *pnalloc = sa->nalloc;
|
|
if (pn) *pn = sa->n;
|
|
|
|
return array;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayGetString()
|
|
*
|
|
* Input: sarray
|
|
* index (to the index-th string)
|
|
* copyflag (L_NOCOPY or L_COPY)
|
|
* Return: string, or null on error
|
|
*
|
|
* Notes:
|
|
* (1) Legacy usage decrees that we always use 0 to get the
|
|
* pointer to the string itself, and 1 to get a copy of
|
|
* the string.
|
|
* (2) See usage comments at the top of this file.
|
|
* (3) To get a pointer to the string itself, use for copyflag:
|
|
* L_NOCOPY or 0 or FALSE
|
|
* To get a copy of the string, use for copyflag:
|
|
* L_COPY or 1 or TRUE
|
|
* The const values of L_NOCOPY and L_COPY are guaranteed not
|
|
* to change.
|
|
*/
|
|
char *
|
|
sarrayGetString(SARRAY *sa,
|
|
l_int32 index,
|
|
l_int32 copyflag)
|
|
{
|
|
PROCNAME("sarrayGetString");
|
|
|
|
if (!sa)
|
|
return (char *)ERROR_PTR("sa not defined", procName, NULL);
|
|
if (index < 0 || index >= sa->n)
|
|
return (char *)ERROR_PTR("index not valid", procName, NULL);
|
|
if (copyflag != L_NOCOPY && copyflag != L_COPY)
|
|
return (char *)ERROR_PTR("invalid copyflag", procName, NULL);
|
|
|
|
if (copyflag == L_NOCOPY)
|
|
return sa->array[index];
|
|
else /* L_COPY */
|
|
return stringNew(sa->array[index]);
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayGetRefCount()
|
|
*
|
|
* Input: sarray
|
|
* Return: refcount, or UNDEF on error
|
|
*/
|
|
l_int32
|
|
sarrayGetRefcount(SARRAY *sa)
|
|
{
|
|
PROCNAME("sarrayGetRefcount");
|
|
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, UNDEF);
|
|
return sa->refcount;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayChangeRefCount()
|
|
*
|
|
* Input: sarray
|
|
* delta (change to be applied)
|
|
* Return: 0 if OK, 1 on error
|
|
*/
|
|
l_int32
|
|
sarrayChangeRefcount(SARRAY *sa,
|
|
l_int32 delta)
|
|
{
|
|
PROCNAME("sarrayChangeRefcount");
|
|
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, UNDEF);
|
|
sa->refcount += delta;
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------*
|
|
* Conversion to string *
|
|
*----------------------------------------------------------------------*/
|
|
/*!
|
|
* sarrayToString()
|
|
*
|
|
* Input: sarray
|
|
* addnlflag (flag: 0 adds nothing to each substring
|
|
* 1 adds '\n' to each substring
|
|
* 2 adds ' ' to each substring)
|
|
* Return: dest string, or null on error
|
|
*
|
|
* Notes:
|
|
* (1) Concatenates all the strings in the sarray, preserving
|
|
* all white space.
|
|
* (2) If addnlflag != 0, adds either a '\n' or a ' ' after
|
|
* each substring.
|
|
* (3) This function was NOT implemented as:
|
|
* for (i = 0; i < n; i++)
|
|
* strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
|
|
* Do you see why?
|
|
*/
|
|
char *
|
|
sarrayToString(SARRAY *sa,
|
|
l_int32 addnlflag)
|
|
{
|
|
PROCNAME("sarrayToString");
|
|
|
|
if (!sa)
|
|
return (char *)ERROR_PTR("sa not defined", procName, NULL);
|
|
|
|
return sarrayToStringRange(sa, 0, 0, addnlflag);
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayToStringRange()
|
|
*
|
|
* Input: sarray
|
|
* first (index of first string to use; starts with 0)
|
|
* nstrings (number of strings to append into the result; use
|
|
* 0 to append to the end of the sarray)
|
|
* addnlflag (flag: 0 adds nothing to each substring
|
|
* 1 adds '\n' to each substring
|
|
* 2 adds ' ' to each substring)
|
|
* Return: dest string, or null on error
|
|
*
|
|
* Notes:
|
|
* (1) Concatenates the specified strings inthe sarray, preserving
|
|
* all white space.
|
|
* (2) If addnlflag != 0, adds either a '\n' or a ' ' after
|
|
* each substring.
|
|
* (3) If the sarray is empty, this returns a string with just
|
|
* the character corresponding to @addnlflag.
|
|
*/
|
|
char *
|
|
sarrayToStringRange(SARRAY *sa,
|
|
l_int32 first,
|
|
l_int32 nstrings,
|
|
l_int32 addnlflag)
|
|
{
|
|
char *dest, *src, *str;
|
|
l_int32 n, i, last, size, index, len;
|
|
|
|
PROCNAME("sarrayToStringRange");
|
|
|
|
if (!sa)
|
|
return (char *)ERROR_PTR("sa not defined", procName, NULL);
|
|
if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2)
|
|
return (char *)ERROR_PTR("invalid addnlflag", procName, NULL);
|
|
|
|
n = sarrayGetCount(sa);
|
|
|
|
/* Empty sa; return char corresponding to addnlflag only */
|
|
if (n == 0) {
|
|
if (first == 0) {
|
|
if (addnlflag == 0)
|
|
return stringNew("");
|
|
if (addnlflag == 1)
|
|
return stringNew("\n");
|
|
else /* addnlflag == 2) */
|
|
return stringNew(" ");
|
|
}
|
|
else
|
|
return (char *)ERROR_PTR("first not valid", procName, NULL);
|
|
}
|
|
|
|
if (first < 0 || first >= n)
|
|
return (char *)ERROR_PTR("first not valid", procName, NULL);
|
|
if (nstrings == 0 || (nstrings > n - first))
|
|
nstrings = n - first; /* no overflow */
|
|
last = first + nstrings - 1;
|
|
|
|
size = 0;
|
|
for (i = first; i <= last; i++) {
|
|
if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
|
|
return (char *)ERROR_PTR("str not found", procName, NULL);
|
|
size += strlen(str) + 2;
|
|
}
|
|
|
|
if ((dest = (char *)CALLOC(size + 1, sizeof(char))) == NULL)
|
|
return (char *)ERROR_PTR("dest not made", procName, NULL);
|
|
|
|
index = 0;
|
|
for (i = first; i <= last; i++) {
|
|
src = sarrayGetString(sa, i, L_NOCOPY);
|
|
len = strlen(src);
|
|
memcpy(dest + index, src, len);
|
|
index += len;
|
|
if (addnlflag == 1) {
|
|
dest[index] = '\n';
|
|
index++;
|
|
}
|
|
else if (addnlflag == 2) {
|
|
dest[index] = ' ';
|
|
index++;
|
|
}
|
|
}
|
|
|
|
return dest;
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------*
|
|
* Concatenate 2 sarrays *
|
|
*----------------------------------------------------------------------*/
|
|
/*!
|
|
* sarrayConcatenate()
|
|
*
|
|
* Input: sa1 (to be added to)
|
|
* sa2 (append to sa1)
|
|
* Return: 0 if OK, 1 on error
|
|
*
|
|
* Notes:
|
|
* (1) Copies of the strings in sarray2 are added to sarray1.
|
|
*/
|
|
l_int32
|
|
sarrayConcatenate(SARRAY *sa1,
|
|
SARRAY *sa2)
|
|
{
|
|
char *str;
|
|
l_int32 n, i;
|
|
|
|
PROCNAME("sarrayConcatenate");
|
|
|
|
if (!sa1)
|
|
return ERROR_INT("sa1 not defined", procName, 1);
|
|
if (!sa2)
|
|
return ERROR_INT("sa2 not defined", procName, 1);
|
|
|
|
n = sarrayGetCount(sa2);
|
|
for (i = 0; i < n; i++) {
|
|
str = sarrayGetString(sa2, i, L_NOCOPY);
|
|
sarrayAddString(sa1, str, L_COPY);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayAppendRange()
|
|
*
|
|
* Input: sa1 (to be added to)
|
|
* sa2 (append specified range of strings in sa2 to sa1)
|
|
* start (index of first string of sa2 to append)
|
|
* end (index of last string of sa2 to append)
|
|
* Return: 0 if OK, 1 on error
|
|
*
|
|
* Notes:
|
|
* (1) Copies of the strings in sarray2 are added to sarray1.
|
|
* (2) The [start ... end] range is truncated if necessary.
|
|
*/
|
|
l_int32
|
|
sarrayAppendRange(SARRAY *sa1,
|
|
SARRAY *sa2,
|
|
l_int32 start,
|
|
l_int32 end)
|
|
{
|
|
char *str;
|
|
l_int32 n, i;
|
|
|
|
PROCNAME("sarrayAppendRange");
|
|
|
|
if (!sa1)
|
|
return ERROR_INT("sa1 not defined", procName, 1);
|
|
if (!sa2)
|
|
return ERROR_INT("sa2 not defined", procName, 1);
|
|
if (start < 0)
|
|
start = 0;
|
|
n = sarrayGetCount(sa2);
|
|
if (end >= n)
|
|
end = n - 1;
|
|
if (start > end)
|
|
return ERROR_INT("start > end", procName, 1);
|
|
|
|
for (i = start; i <= end; i++) {
|
|
str = sarrayGetString(sa2, i, L_NOCOPY);
|
|
sarrayAddString(sa1, str, L_COPY);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------*
|
|
* Pad an sarray to be the same size as another sarray *
|
|
*----------------------------------------------------------------------*/
|
|
/*!
|
|
* sarrayPadToSameSize()
|
|
*
|
|
* Input: sa1, sa2
|
|
* padstring
|
|
* Return: 0 if OK, 1 on error
|
|
*
|
|
* Notes:
|
|
* (1) If two sarrays have different size, this adds enough
|
|
* instances of @padstring to the smaller so that they are
|
|
* the same size. It is useful when two or more sarrays
|
|
* are being sequenced in parallel, and it is necessary to
|
|
* find a valid string at each index.
|
|
*/
|
|
l_int32
|
|
sarrayPadToSameSize(SARRAY *sa1,
|
|
SARRAY *sa2,
|
|
char *padstring)
|
|
{
|
|
l_int32 i, n1, n2;
|
|
|
|
PROCNAME("sarrayPadToSameSize");
|
|
|
|
if (!sa1 || !sa2)
|
|
return ERROR_INT("both sa1 and sa2 not defined", procName, 1);
|
|
|
|
n1 = sarrayGetCount(sa1);
|
|
n2 = sarrayGetCount(sa2);
|
|
if (n1 < n2) {
|
|
for (i = n1; i < n2; i++)
|
|
sarrayAddString(sa1, padstring, L_COPY);
|
|
}
|
|
else if (n1 > n2) {
|
|
for (i = n2; i < n1; i++)
|
|
sarrayAddString(sa2, padstring, L_COPY);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------*
|
|
* Convert word sarray to line sarray *
|
|
*----------------------------------------------------------------------*/
|
|
/*!
|
|
* sarrayConvertWordsToLines()
|
|
*
|
|
* Input: sa (sa of individual words)
|
|
* linesize (max num of chars in each line)
|
|
* Return: saout (sa of formatted lines), or null on error
|
|
*
|
|
* This is useful for re-typesetting text to a specific maximum
|
|
* line length. The individual words in the input sarray
|
|
* are concatenated into textlines. An input word string of zero
|
|
* length is taken to be a paragraph separator. Each time
|
|
* such a string is found, the current line is ended and
|
|
* a new line is also produced that contains just the
|
|
* string of zero length (""). When the output sarray
|
|
* of lines is eventually converted to a string with newlines
|
|
* (typically) appended to each line string, the empty
|
|
* strings are just converted to newlines, producing the visible
|
|
* paragraph separation.
|
|
*
|
|
* What happens when a word is larger than linesize?
|
|
* We write it out as a single line anyway! Words preceding
|
|
* or following this long word are placed on lines preceding
|
|
* or following the line with the long word. Why this choice?
|
|
* Long "words" found in text documents are typically URLs, and
|
|
* it's often desirable not to put newlines in the middle of a URL.
|
|
* The text display program (e.g., text editor) will typically
|
|
* wrap the long "word" to fit in the window.
|
|
*/
|
|
SARRAY *
|
|
sarrayConvertWordsToLines(SARRAY *sa,
|
|
l_int32 linesize)
|
|
{
|
|
char *wd, *strl;
|
|
char emptystring[] = "";
|
|
l_int32 n, i, len, totlen;
|
|
SARRAY *sal, *saout;
|
|
|
|
PROCNAME("sarrayConvertWordsToLines");
|
|
|
|
if (!sa)
|
|
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
|
|
|
|
if ((saout = sarrayCreate(0)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("saout not defined", procName, NULL);
|
|
|
|
n = sarrayGetCount(sa);
|
|
totlen = 0;
|
|
sal = NULL;
|
|
for (i = 0; i < n; i++) {
|
|
if (!sal) {
|
|
if ((sal = sarrayCreate(0)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
|
|
}
|
|
wd = sarrayGetString(sa, i, L_NOCOPY);
|
|
len = strlen(wd);
|
|
if (len == 0) { /* end of paragraph: end line & insert blank line */
|
|
if (totlen > 0) {
|
|
strl = sarrayToString(sal, 2);
|
|
sarrayAddString(saout, strl, L_INSERT);
|
|
}
|
|
sarrayAddString(saout, emptystring, L_COPY);
|
|
sarrayDestroy(&sal);
|
|
totlen = 0;
|
|
}
|
|
else if (totlen == 0 && len + 1 > linesize) { /* long word! */
|
|
sarrayAddString(saout, wd, L_COPY); /* copy to one line */
|
|
}
|
|
else if (totlen + len + 1 > linesize) { /* end line & start new one */
|
|
strl = sarrayToString(sal, 2);
|
|
sarrayAddString(saout, strl, L_INSERT);
|
|
sarrayDestroy(&sal);
|
|
if ((sal = sarrayCreate(0)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
|
|
sarrayAddString(sal, wd, L_COPY);
|
|
totlen = len + 1;
|
|
}
|
|
else { /* add to current line */
|
|
sarrayAddString(sal, wd, L_COPY);
|
|
totlen += len + 1;
|
|
}
|
|
}
|
|
if (totlen > 0) { /* didn't end with blank line; output last line */
|
|
strl = sarrayToString(sal, 2);
|
|
sarrayAddString(saout, strl, L_INSERT);
|
|
sarrayDestroy(&sal);
|
|
}
|
|
|
|
return saout;
|
|
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------*
|
|
* Split string on separator list *
|
|
*----------------------------------------------------------------------*/
|
|
/*
|
|
* sarraySplitString()
|
|
*
|
|
* Input: sa (to append to; typically empty initially)
|
|
* str (string to split; not changed)
|
|
* separators (characters that split input string)
|
|
* Return: 0 if OK, 1 on error.
|
|
*
|
|
* Notes:
|
|
* (1) This uses strtokSafe(). See the notes there in utils.c.
|
|
*/
|
|
l_int32
|
|
sarraySplitString(SARRAY *sa,
|
|
const char *str,
|
|
const char *separators)
|
|
{
|
|
char *cstr, *substr, *saveptr;
|
|
|
|
PROCNAME("sarraySplitString");
|
|
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 1);
|
|
if (!str)
|
|
return ERROR_INT("str not defined", procName, 1);
|
|
if (!separators)
|
|
return ERROR_INT("separators not defined", procName, 1);
|
|
|
|
cstr = stringNew(str); /* preserves const-ness of input str */
|
|
substr = strtokSafe(cstr, separators, &saveptr);
|
|
if (substr)
|
|
sarrayAddString(sa, substr, L_INSERT);
|
|
while ((substr = strtokSafe(NULL, separators, &saveptr)))
|
|
sarrayAddString(sa, substr, L_INSERT);
|
|
FREE(cstr);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------*
|
|
* Filter sarray *
|
|
*----------------------------------------------------------------------*/
|
|
/*!
|
|
* sarraySelectBySubstring()
|
|
*
|
|
* Input: sain (input sarray)
|
|
* substr (<optional> substring for matching; can be NULL)
|
|
* Return: saout (output sarray, filtered with substring) or null on error
|
|
*
|
|
* Notes:
|
|
* (1) This selects all strings in sain that have substr as a substring.
|
|
* Note that we can't use strncmp() because we're looking for
|
|
* a match to the substring anywhere within each filename.
|
|
* (2) If substr == NULL, returns a copy of the sarray.
|
|
*/
|
|
SARRAY *
|
|
sarraySelectBySubstring(SARRAY *sain,
|
|
const char *substr)
|
|
{
|
|
char *str;
|
|
l_int32 n, i, offset, found;
|
|
SARRAY *saout;
|
|
|
|
PROCNAME("sarraySelectBySubstring");
|
|
|
|
if (!sain)
|
|
return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
|
|
|
|
n = sarrayGetCount(sain);
|
|
if (!substr || n == 0)
|
|
return sarrayCopy(sain);
|
|
|
|
saout = sarrayCreate(n);
|
|
for (i = 0; i < n; i++) {
|
|
str = sarrayGetString(sain, i, L_NOCOPY);
|
|
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
|
|
strlen(substr), &offset, &found);
|
|
if (found)
|
|
sarrayAddString(saout, str, L_COPY);
|
|
}
|
|
|
|
return saout;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarraySelectByRange()
|
|
*
|
|
* Input: sain (input sarray)
|
|
* first (index of first string to be selected)
|
|
* last (index of last string to be selected; use 0 to go to the
|
|
* end of the sarray)
|
|
* Return: saout (output sarray), or null on error
|
|
*
|
|
* Notes:
|
|
* (1) This makes @saout consisting of copies of all strings in @sain
|
|
* in the index set [first ... last]. Use @last == 0 to get all
|
|
* strings from @first to the last string in the sarray.
|
|
*/
|
|
SARRAY *
|
|
sarraySelectByRange(SARRAY *sain,
|
|
l_int32 first,
|
|
l_int32 last)
|
|
{
|
|
char *str;
|
|
l_int32 n, i;
|
|
SARRAY *saout;
|
|
|
|
PROCNAME("sarraySelectByRange");
|
|
|
|
if (!sain)
|
|
return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
|
|
if (first < 0) first = 0;
|
|
n = sarrayGetCount(sain);
|
|
if (last <= 0) last = n - 1;
|
|
if (last >= n) {
|
|
L_WARNING("@last > n - 1; setting to n - 1", procName);
|
|
last = n - 1;
|
|
}
|
|
if (first > last)
|
|
return (SARRAY *)ERROR_PTR("first must be >= last", procName, NULL);
|
|
|
|
saout = sarrayCreate(0);
|
|
for (i = first; i <= last; i++) {
|
|
str = sarrayGetString(sain, i, L_COPY);
|
|
sarrayAddString(saout, str, L_INSERT);
|
|
}
|
|
|
|
return saout;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayParseRange()
|
|
*
|
|
* Input: sa (input sarray)
|
|
* start (index to start range search)
|
|
* &actualstart (<return> index of actual start; may be > 'start')
|
|
* &end (<return> index of end)
|
|
* &newstart (<return> index of start of next range)
|
|
* substr (substring for matching at beginning of string)
|
|
* loc (byte offset within the string for the pattern; use
|
|
* -1 if the location does not matter);
|
|
* Return: 0 if valid range found; 1 otherwise
|
|
*
|
|
* Notes:
|
|
* (1) This finds the range of the next set of strings in SA,
|
|
* beginning the search at 'start', that does NOT have
|
|
* the substring 'substr' either at the indicated location
|
|
* in the string or anywhere in the string. The input
|
|
* variable 'loc' is the specified offset within the string;
|
|
* use -1 to indicate 'anywhere in the string'.
|
|
* (2) Always check the return value to verify that a valid range
|
|
* was found.
|
|
* (3) If a valid range is not found, the values of actstart,
|
|
* end and newstart are all set to the size of sa.
|
|
* (4) If this is the last valid range, newstart returns the value n.
|
|
* In use, this should be tested before calling the function.
|
|
* (5) Usage example. To find all the valid ranges in a file
|
|
* where the invalid lines begin with two dashes, copy each
|
|
* line in the file to a string in an sarray, and do:
|
|
* start = 0;
|
|
* while (!sarrayParseRange(sa, start, &actstart, &end, &start,
|
|
* "--", 0))
|
|
* fprintf(stderr, "start = %d, end = %d\n", actstart, end);
|
|
*/
|
|
l_int32
|
|
sarrayParseRange(SARRAY *sa,
|
|
l_int32 start,
|
|
l_int32 *pactualstart,
|
|
l_int32 *pend,
|
|
l_int32 *pnewstart,
|
|
const char *substr,
|
|
l_int32 loc)
|
|
{
|
|
char *str;
|
|
l_int32 n, i, offset, found;
|
|
|
|
PROCNAME("sarrayParseRange");
|
|
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 1);
|
|
if (!pactualstart || !pend || !pnewstart)
|
|
return ERROR_INT("not all range addresses defined", procName, 1);
|
|
n = sarrayGetCount(sa);
|
|
*pactualstart = *pend = *pnewstart = n;
|
|
if (!substr)
|
|
return ERROR_INT("substr not defined", procName, 1);
|
|
|
|
/* Look for the first string without the marker */
|
|
if (start < 0 || start >= n)
|
|
return 1;
|
|
for (i = start; i < n; i++) {
|
|
str = sarrayGetString(sa, i, L_NOCOPY);
|
|
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
|
|
strlen(substr), &offset, &found);
|
|
if (loc < 0) {
|
|
if (!found) break;
|
|
} else {
|
|
if (!found || offset != loc) break;
|
|
}
|
|
}
|
|
start = i;
|
|
if (i == n) /* couldn't get started */
|
|
return 1;
|
|
|
|
/* Look for the last string without the marker */
|
|
*pactualstart = start;
|
|
for (i = start + 1; i < n; i++) {
|
|
str = sarrayGetString(sa, i, L_NOCOPY);
|
|
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
|
|
strlen(substr), &offset, &found);
|
|
if (loc < 0) {
|
|
if (found) break;
|
|
} else {
|
|
if (found && offset == loc) break;
|
|
}
|
|
}
|
|
*pend = i - 1;
|
|
start = i;
|
|
if (i == n) /* no further range */
|
|
return 0;
|
|
|
|
/* Look for the first string after *pend without the marker.
|
|
* This will start the next run of strings, if it exists. */
|
|
for (i = start; i < n; i++) {
|
|
str = sarrayGetString(sa, i, L_NOCOPY);
|
|
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
|
|
strlen(substr), &offset, &found);
|
|
if (loc < 0) {
|
|
if (!found) break;
|
|
} else {
|
|
if (!found || offset != loc) break;
|
|
}
|
|
}
|
|
if (i < n)
|
|
*pnewstart = i;
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------*
|
|
* Sort *
|
|
*----------------------------------------------------------------------*/
|
|
/*!
|
|
* sarraySort()
|
|
*
|
|
* Input: saout (output sarray; can be NULL or equal to sain)
|
|
* sain (input sarray)
|
|
* sortorder (L_SORT_INCREASING or L_SORT_DECREASING)
|
|
* Return: saout (output sarray, sorted by ascii value), or null on error
|
|
*
|
|
* Notes:
|
|
* (1) Set saout = sain for in-place; otherwise, set naout = NULL.
|
|
* (2) Shell sort, modified from K&R, 2nd edition, p.62.
|
|
* Slow but simple O(n logn) sort.
|
|
*/
|
|
SARRAY *
|
|
sarraySort(SARRAY *saout,
|
|
SARRAY *sain,
|
|
l_int32 sortorder)
|
|
{
|
|
char **array;
|
|
char *tmp;
|
|
l_int32 n, i, j, gap;
|
|
|
|
PROCNAME("sarraySort");
|
|
|
|
if (!sain)
|
|
return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
|
|
|
|
/* Make saout if necessary; otherwise do in-place */
|
|
if (!saout)
|
|
saout = sarrayCopy(sain);
|
|
else if (sain != saout)
|
|
return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL);
|
|
array = saout->array; /* operate directly on the array */
|
|
n = sarrayGetCount(saout);
|
|
|
|
/* Shell sort */
|
|
for (gap = n/2; gap > 0; gap = gap / 2) {
|
|
for (i = gap; i < n; i++) {
|
|
for (j = i - gap; j >= 0; j -= gap) {
|
|
if ((sortorder == L_SORT_INCREASING &&
|
|
stringCompareLexical(array[j], array[j + gap])) ||
|
|
(sortorder == L_SORT_DECREASING &&
|
|
stringCompareLexical(array[j + gap], array[j])))
|
|
{
|
|
tmp = array[j];
|
|
array[j] = array[j + gap];
|
|
array[j + gap] = tmp;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return saout;
|
|
}
|
|
|
|
|
|
/*!
|
|
* stringCompareLexical()
|
|
*
|
|
* Input: str1
|
|
* str2
|
|
* Return: 1 if str1 > str2 (lexically); 0 otherwise
|
|
*
|
|
* Notes:
|
|
* (1) If the lexical values are identical, return a 0, to
|
|
* indicate that no swapping is required to sort the strings.
|
|
*/
|
|
l_int32
|
|
stringCompareLexical(const char *str1,
|
|
const char *str2)
|
|
{
|
|
l_int32 i, len1, len2, len;
|
|
|
|
PROCNAME("sarrayCompareLexical");
|
|
|
|
if (!str1)
|
|
return ERROR_INT("str1 not defined", procName, 1);
|
|
if (!str2)
|
|
return ERROR_INT("str2 not defined", procName, 1);
|
|
|
|
len1 = strlen(str1);
|
|
len2 = strlen(str2);
|
|
len = L_MIN(len1, len2);
|
|
|
|
for (i = 0; i < len; i++) {
|
|
if (str1[i] == str2[i])
|
|
continue;
|
|
if (str1[i] > str2[i])
|
|
return 1;
|
|
else
|
|
return 0;
|
|
}
|
|
|
|
if (len1 > len2)
|
|
return 1;
|
|
else
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------*
|
|
* Serialize for I/O *
|
|
*----------------------------------------------------------------------*/
|
|
/*!
|
|
* sarrayRead()
|
|
*
|
|
* Input: filename
|
|
* Return: sarray, or null on error
|
|
*/
|
|
SARRAY *
|
|
sarrayRead(const char *filename)
|
|
{
|
|
FILE *fp;
|
|
SARRAY *sa;
|
|
|
|
PROCNAME("sarrayRead");
|
|
|
|
if (!filename)
|
|
return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL);
|
|
|
|
if ((fp = fopenReadStream(filename)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);
|
|
|
|
if ((sa = sarrayReadStream(fp)) == NULL) {
|
|
fclose(fp);
|
|
return (SARRAY *)ERROR_PTR("sa not read", procName, NULL);
|
|
}
|
|
|
|
fclose(fp);
|
|
return sa;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayReadStream()
|
|
*
|
|
* Input: stream
|
|
* Return: sarray, or null on error
|
|
*
|
|
* Notes:
|
|
* (1) We store the size of each string along with the string.
|
|
* (2) This allows a string to have embedded newlines. By reading
|
|
* the entire string, as determined by its size, we are
|
|
* not affected by any number of embedded newlines.
|
|
*/
|
|
SARRAY *
|
|
sarrayReadStream(FILE *fp)
|
|
{
|
|
char *stringbuf;
|
|
l_int32 i, n, size, index, bufsize, ret, version;
|
|
SARRAY *sa;
|
|
|
|
PROCNAME("sarrayReadStream");
|
|
|
|
if (!fp)
|
|
return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL);
|
|
|
|
ret = fscanf(fp, "\nSarray Version %d\n", &version);
|
|
if (ret != 1)
|
|
return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL);
|
|
if (version != SARRAY_VERSION_NUMBER)
|
|
return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL);
|
|
fscanf(fp, "Number of strings = %d\n", &n);
|
|
|
|
if ((sa = sarrayCreate(n)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
|
|
bufsize = L_BUF_SIZE + 1;
|
|
if ((stringbuf = (char *)CALLOC(bufsize, sizeof(char))) == NULL)
|
|
return (SARRAY *)ERROR_PTR("stringbuf not made", procName, NULL);
|
|
|
|
for (i = 0; i < n; i++) {
|
|
/* Get the size of the stored string */
|
|
fscanf(fp, "%d[%d]:", &index, &size);
|
|
/* Expand the string buffer if necessary */
|
|
if (size > bufsize - 5) {
|
|
FREE(stringbuf);
|
|
bufsize = (l_int32)(1.5 * size);
|
|
stringbuf = (char *)CALLOC(bufsize, sizeof(char));
|
|
}
|
|
/* Read the stored string, plus leading spaces and trailing \n */
|
|
fread(stringbuf, 1, size + 3, fp);
|
|
/* Remove the \n that was added by sarrayWriteStream() */
|
|
stringbuf[size + 2] = '\0';
|
|
/* Copy it in, skipping the 2 leading spaces */
|
|
sarrayAddString(sa, stringbuf + 2, L_COPY);
|
|
}
|
|
fscanf(fp, "\n");
|
|
|
|
FREE(stringbuf);
|
|
return sa;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayWrite()
|
|
*
|
|
* Input: filename
|
|
* sarray
|
|
* Return: 0 if OK; 1 on error
|
|
*/
|
|
l_int32
|
|
sarrayWrite(const char *filename,
|
|
SARRAY *sa)
|
|
{
|
|
FILE *fp;
|
|
|
|
PROCNAME("sarrayWrite");
|
|
|
|
if (!filename)
|
|
return ERROR_INT("filename not defined", procName, 1);
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 1);
|
|
|
|
if ((fp = fopen(filename, "w")) == NULL)
|
|
return ERROR_INT("stream not opened", procName, 1);
|
|
|
|
if (sarrayWriteStream(fp, sa))
|
|
return ERROR_INT("sa not written to stream", procName, 1);
|
|
|
|
fclose(fp);
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayWriteStream()
|
|
*
|
|
* Input: stream
|
|
* sarray
|
|
* Returns 0 if OK; 1 on error
|
|
*
|
|
* Notes:
|
|
* (1) This appends a '\n' to each string, which is stripped
|
|
* off by sarrayReadStream().
|
|
*/
|
|
l_int32
|
|
sarrayWriteStream(FILE *fp,
|
|
SARRAY *sa)
|
|
{
|
|
l_int32 i, n, len;
|
|
|
|
PROCNAME("sarrayWriteStream");
|
|
|
|
if (!fp)
|
|
return ERROR_INT("stream not defined", procName, 1);
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 1);
|
|
|
|
n = sarrayGetCount(sa);
|
|
fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
|
|
fprintf(fp, "Number of strings = %d\n", n);
|
|
for (i = 0; i < n; i++) {
|
|
len = strlen(sa->array[i]);
|
|
fprintf(fp, " %d[%d]: %s\n", i, len, sa->array[i]);
|
|
}
|
|
fprintf(fp, "\n");
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*!
|
|
* sarrayAppend()
|
|
*
|
|
* Input: filename
|
|
* sarray
|
|
* Return: 0 if OK; 1 on error
|
|
*/
|
|
l_int32
|
|
sarrayAppend(const char *filename,
|
|
SARRAY *sa)
|
|
{
|
|
FILE *fp;
|
|
|
|
PROCNAME("sarrayAppend");
|
|
|
|
if (!filename)
|
|
return ERROR_INT("filename not defined", procName, 1);
|
|
if (!sa)
|
|
return ERROR_INT("sa not defined", procName, 1);
|
|
|
|
if ((fp = fopen(filename, "a")) == NULL)
|
|
return ERROR_INT("stream not opened", procName, 1);
|
|
|
|
if (sarrayWriteStream(fp, sa))
|
|
return ERROR_INT("sa not appended to stream", procName, 1);
|
|
|
|
fclose(fp);
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*---------------------------------------------------------------------*
|
|
* Directory filenames *
|
|
*---------------------------------------------------------------------*/
|
|
/*!
|
|
* getNumberedPathnamesInDirectory()
|
|
*
|
|
* Input: directory name
|
|
* substr (<optional> substring filter on filenames; can be NULL)
|
|
* numpre (number of characters in name before number)
|
|
* numpost (number of characters in name after number)
|
|
* maxnum (only consider page numbers up to this value)
|
|
* Return: sarray of sorted pathnames, or NULL on error
|
|
*
|
|
* Notes:
|
|
* (1) Returns the full pathnames of the numbered filenames in
|
|
* the directory. The number in the filename is the index
|
|
* into the sarray. For indices for which there are no filenames,
|
|
* an empty string ("") is placed into the sarray.
|
|
* This makes reading numbered files very simple. For example,
|
|
* the image whose filename includes number N can be retrieved using
|
|
* pixReadIndexed(sa, N);
|
|
* (2) If @substr is not NULL, only filenames that contain
|
|
* the substring can be included. If @substr is NULL,
|
|
* all matching filenames are used.
|
|
* (3) If no numbered files are found, it returns an empty sarray,
|
|
* with no initialized strings.
|
|
* (4) It is assumed that the page number is contained within
|
|
* the basename (the filename without directory or extension).
|
|
* @numpre is the number of characters in the basename
|
|
* preceeding the actual page number; @numpost is the number
|
|
* following the page number.
|
|
* (5) To use a O(n) matching algorithm, the largest page number
|
|
* is found and two internal arrays of this size are created.
|
|
* This maximum is constrained not to exceed @maxsum,
|
|
* to make sure that an unrealistically large number is not
|
|
* accidentally used to determine the array sizes.
|
|
*/
|
|
SARRAY *
|
|
getNumberedPathnamesInDirectory(const char *dirname,
|
|
const char *substr,
|
|
l_int32 numpre,
|
|
l_int32 numpost,
|
|
l_int32 maxnum)
|
|
{
|
|
char *fname, *str;
|
|
l_int32 i, nfiles, num, index;
|
|
SARRAY *sa, *saout;
|
|
|
|
PROCNAME("getNumberedPathnamesInDirectory");
|
|
|
|
if (!dirname)
|
|
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
|
|
|
|
if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
|
|
if ((nfiles = sarrayGetCount(sa)) == 0)
|
|
return sarrayCreate(1);
|
|
|
|
/* Find the last file in the sorted array that has a number
|
|
* that (a) matches the count pattern and (b) does not
|
|
* exceed @maxnum. @maxnum sets an upper limit on the size
|
|
* of the sarray. */
|
|
num = 0;
|
|
for (i = nfiles - 1; i >= 0; i--) {
|
|
fname = sarrayGetString(sa, i, L_NOCOPY);
|
|
num = extractNumberFromFilename(fname, numpre, numpost);
|
|
if (num < 0) continue;
|
|
num = L_MIN(num + 1, maxnum);
|
|
break;
|
|
}
|
|
|
|
if (num <= 0) /* none found */
|
|
return sarrayCreate(1);
|
|
|
|
/* Insert pathnames into the output sarray.
|
|
* Ignore numbers that are out of the range of sarray. */
|
|
saout = sarrayCreateInitialized(num, (char *)"");
|
|
for (i = 0; i < nfiles; i++) {
|
|
fname = sarrayGetString(sa, i, L_NOCOPY);
|
|
index = extractNumberFromFilename(fname, numpre, numpost);
|
|
if (index < 0 || index >= num) continue;
|
|
str = sarrayGetString(saout, index, L_NOCOPY);
|
|
if (str[0] != '\0')
|
|
L_WARNING_INT("\n Multiple files with same number: %d",
|
|
procName, index);
|
|
sarrayReplaceString(saout, index, fname, L_COPY);
|
|
}
|
|
|
|
sarrayDestroy(&sa);
|
|
return saout;
|
|
}
|
|
|
|
|
|
/*!
|
|
* getSortedPathnamesInDirectory()
|
|
*
|
|
* Input: directory name
|
|
* substr (<optional> substring filter on filenames; can be NULL)
|
|
* firstpage (0-based)
|
|
* npages (use 0 for all to the end)
|
|
* Return: sarray of sorted pathnames, or NULL on error
|
|
*
|
|
* Notes:
|
|
* (1) If @substr is not NULL, only filenames that contain
|
|
* the substring can be returned. If @substr == NULL,
|
|
* none of the filenames are filtered out.
|
|
* (2) The files in the directory, after optional filtering by
|
|
* the substring, are lexically sorted in increasing order.
|
|
* The full pathnames are returned for the requested sequence.
|
|
* If no files are found after filtering, returns an empty sarray.
|
|
*/
|
|
SARRAY *
|
|
getSortedPathnamesInDirectory(const char *dirname,
|
|
const char *substr,
|
|
l_int32 firstpage,
|
|
l_int32 npages)
|
|
{
|
|
char *fname, *fullname;
|
|
l_int32 i, nfiles, lastpage;
|
|
SARRAY *sa, *safiles, *saout;
|
|
|
|
PROCNAME("getSortedPathnamesInDirectory");
|
|
|
|
if (!dirname)
|
|
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
|
|
|
|
if ((sa = getFilenamesInDirectory(dirname)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
|
|
safiles = sarraySelectBySubstring(sa, substr);
|
|
sarrayDestroy(&sa);
|
|
nfiles = sarrayGetCount(safiles);
|
|
if (nfiles == 0) {
|
|
L_WARNING("no files found", procName);
|
|
return safiles;
|
|
}
|
|
|
|
sarraySort(safiles, safiles, L_SORT_INCREASING);
|
|
|
|
firstpage = L_MIN(L_MAX(firstpage, 0), nfiles - 1);
|
|
if (npages == 0)
|
|
npages = nfiles - firstpage;
|
|
lastpage = L_MIN(firstpage + npages - 1, nfiles - 1);
|
|
|
|
saout = sarrayCreate(lastpage - firstpage + 1);
|
|
for (i = firstpage; i <= lastpage; i++) {
|
|
fname = sarrayGetString(safiles, i, L_NOCOPY);
|
|
fullname = genPathname(dirname, fname);
|
|
sarrayAddString(saout, fullname, L_INSERT);
|
|
}
|
|
|
|
sarrayDestroy(&safiles);
|
|
return saout;
|
|
}
|
|
|
|
|
|
/*!
|
|
* getFilenamesInDirectory()
|
|
*
|
|
* Input: directory name
|
|
* Return: sarray of file names, or NULL on error
|
|
*
|
|
* Notes:
|
|
* (1) The versions compiled under unix and cygwin use the POSIX C
|
|
* library commands for handling directories. For windows,
|
|
* there is a separate implementation.
|
|
* (2) It returns an array of filename tails; i.e., only the part of
|
|
* the path after the last slash.
|
|
* (3) Use of the d_type field of dirent is not portable:
|
|
* "According to POSIX, the dirent structure contains a field
|
|
* char d_name[] of unspecified size, with at most NAME_MAX
|
|
* characters preceding the terminating null character. Use
|
|
* of other fields will harm the portability of your programs."
|
|
* (4) As a consequence of (3), we note several things:
|
|
* - MINGW doesn't have a d_type member.
|
|
* - Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
|
|
* for d_type from all files.
|
|
* On these systems, this function will return directories
|
|
* (except for '.' and '..', which are eliminated using
|
|
* the d_name field).
|
|
*/
|
|
|
|
#ifndef COMPILER_MSVC
|
|
|
|
SARRAY *
|
|
getFilenamesInDirectory(const char *dirname)
|
|
{
|
|
char *name;
|
|
l_int32 len;
|
|
SARRAY *safiles;
|
|
DIR *pdir;
|
|
struct dirent *pdirentry;
|
|
|
|
PROCNAME("getFilenamesInDirectory");
|
|
|
|
if (!dirname)
|
|
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
|
|
|
|
if ((safiles = sarrayCreate(0)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
|
|
if ((pdir = opendir(dirname)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL);
|
|
while ((pdirentry = readdir(pdir))) {
|
|
|
|
/* It's nice to ignore directories. For this it is necessary to
|
|
* define _BSD_SOURCE in the CC command, because the DT_DIR
|
|
* flag is non-standard. */
|
|
#if !defined(__MINGW32__) && !defined(_CYGWIN_ENVIRON) && !defined(__SOLARIS__)
|
|
if (pdirentry->d_type == DT_DIR)
|
|
continue;
|
|
#endif
|
|
|
|
/* Filter out "." and ".." if they're passed through */
|
|
name = pdirentry->d_name;
|
|
len = strlen(name);
|
|
if (len == 1 && name[len - 1] == '.') continue;
|
|
if (len == 2 && name[len - 1] == '.' && name[len - 2] == '.') continue;
|
|
sarrayAddString(safiles, name, L_COPY);
|
|
}
|
|
closedir(pdir);
|
|
|
|
return safiles;
|
|
}
|
|
|
|
#else /* COMPILER_MSVC */
|
|
|
|
/* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
|
|
#include <windows.h>
|
|
SARRAY *
|
|
getFilenamesInDirectory(const char *dirname)
|
|
{
|
|
char szDir[MAX_PATH];
|
|
char *tempname;
|
|
l_int32 dirlen;
|
|
HANDLE hFind = INVALID_HANDLE_VALUE;
|
|
SARRAY *safiles;
|
|
WIN32_FIND_DATAA ffd;
|
|
|
|
PROCNAME("getFilenamesInDirectory");
|
|
|
|
if (!dirname)
|
|
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
|
|
|
|
dirlen = strlen(dirname);
|
|
if (dirlen > (MAX_PATH - 3))
|
|
return (SARRAY *)ERROR_PTR("dirname is too long", procName, NULL);
|
|
|
|
if (stringFindSubstr(dirname, "/", NULL) > 0) {
|
|
tempname = stringReplaceEachSubstr(dirname, "/", "\\", NULL);
|
|
strncpy_s(szDir, sizeof(szDir), tempname, _TRUNCATE);
|
|
FREE(tempname);
|
|
}
|
|
else
|
|
strncpy_s(szDir, sizeof(szDir), dirname, _TRUNCATE);
|
|
strncat_s(szDir, sizeof(szDir), TEXT("\\*"), MAX_PATH - strlen(szDir) - 1);
|
|
|
|
if ((safiles = sarrayCreate(0)) == NULL)
|
|
return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
|
|
hFind = FindFirstFileA(szDir, &ffd);
|
|
if (INVALID_HANDLE_VALUE == hFind) {
|
|
sarrayDestroy(&safiles);
|
|
return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL);
|
|
}
|
|
|
|
while (FindNextFileA(hFind, &ffd) != 0) {
|
|
if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) /* skip dirs */
|
|
continue;
|
|
sarrayAddString(safiles, ffd.cFileName, L_COPY);
|
|
}
|
|
|
|
FindClose(hFind);
|
|
return safiles;
|
|
}
|
|
|
|
#endif /* COMPILER_MSVC */
|
|
|