Annotation of parser3/src/lib/cord/include/cord.h, revision 1.7

1.2       paf         1: /* 
                      2:  * Copyright (c) 1993-1994 by Xerox Corporation.  All rights reserved.
                      3:  *
                      4:  * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
                      5:  * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
                      6:  *
                      7:  * Permission is hereby granted to use or copy this program
                      8:  * for any purpose,  provided the above notices are retained on all copies.
                      9:  * Permission to modify the code and to distribute modified code is granted,
                     10:  * provided the above notices are retained, and a notice that the code was
                     11:  * modified is included with the above copyright notice.
                     12:  *
                     13:  * Author: Hans-J. Boehm (boehm@parc.xerox.com)
                     14:  */
                     15: /* Boehm, October 5, 1995 4:20 pm PDT */
                     16:  
                     17: /*
                     18:  * Cords are immutable character strings.  A number of operations
                     19:  * on long cords are much more efficient than their strings.h counterparts.
                     20:  * In particular, concatenation takes constant time independent of the length
                     21:  * of the arguments.  (Cords are represented as trees, with internal
                     22:  * nodes representing concatenation and leaves consisting of either C
                     23:  * strings or a functional description of the string.)
                     24:  *
                     25:  * The following are reasonable applications of cords.  They would perform
                     26:  * unacceptably if C strings were used:
                     27:  * - A compiler that produces assembly language output by repeatedly
                     28:  *   concatenating instructions onto a cord representing the output file.
                     29:  * - A text editor that converts the input file to a cord, and then
                     30:  *   performs editing operations by producing a new cord representing
                     31:  *   the file after each character change (and keeping the old ones in an
                     32:  *   edit history)
                     33:  *
                     34:  * For optimal performance, cords should be built by
                     35:  * concatenating short sections.
                     36:  * This interface is designed for maximum compatibility with C strings.
                     37:  * ASCII NUL characters may be embedded in cords using CORD_from_fn.
                     38:  * This is handled correctly, but CORD_to_char_star will produce a string
                     39:  * with embedded NULs when given such a cord. 
                     40:  *
                     41:  * This interface is fairly big, largely for performance reasons.
                     42:  * The most basic constants and functions:
                     43:  *
                     44:  * CORD - the type of a cord;
                     45:  * CORD_EMPTY - empty cord;
                     46:  * CORD_len(cord) - length of a cord;
                     47:  * CORD_cat(cord1,cord2) - concatenation of two cords;
                     48:  * CORD_substr(cord, start, len) - substring (or subcord);
                     49:  * CORD_pos i;  CORD_FOR(i, cord) {  ... CORD_pos_fetch(i) ... } -
                     50:  *    examine each character in a cord.  CORD_pos_fetch(i) is the char.
                     51:  * CORD_fetch(cord, i) - Retrieve i'th character (slowly).
                     52:  * CORD_cmp(cord1, cord2) - compare two cords.
                     53:  * CORD_from_file(FILE * f) - turn a read-only file into a cord.
                     54:  * CORD_to_char_star(cord) - convert to C string.
                     55:  *   (Non-NULL C constant strings are cords.)
                     56:  * CORD_printf (etc.) - cord version of printf. Use %r for cords.
                     57:  */
                     58: # ifndef CORD_H
                     59: 
                     60: # define CORD_H
                     61: # include <stddef.h>
                     62: # include <stdio.h>
                     63: /* Cords have type const char *.  This is cheating quite a bit, and not        */
                     64: /* 100% portable.  But it means that nonempty character string         */
                     65: /* constants may be used as cords directly, provided the string is     */
                     66: /* never modified in place.  The empty cord is represented by, and     */
                     67: /* can be written as, 0.                                               */
                     68: 
                     69: typedef const char * CORD;
                     70: 
                     71: /* An empty cord is always represented as nil  */
                     72: # define CORD_EMPTY 0
                     73: 
                     74: /* Is a nonempty cord represented as a C string? */
                     75: #define CORD_IS_STRING(s) (*(s) != '\0')
                     76: 
1.6       misha      77: /* Allows struct Concatenation modification on merge in CORD_cat,
                     78:  thus all source Concatenation structs must be prepared for this */
                     79: #define CORD_CAT_OPTIMIZATION
                     80: 
                     81: /* Caches CORD_chars result to avoid useless allocations */
                     82: #define CORD_CHARS_CACHE
                     83: 
1.2       paf        84: /* Concatenate two cords.  If the arguments are C strings, they may    */
                     85: /* not be subsequently altered.                                                */
                     86: CORD CORD_cat(CORD x, CORD y);
                     87: 
                     88: /* Concatenate a cord and a C string with known length.  Except for the        */
                     89: /* empty string case, this is a special case of CORD_cat.  Since the   */
                     90: /* length is known, it can be faster.                                  */
                     91: /* The string y is shared with the resulting CORD.  Hence it should    */
                     92: /* not be altered by the caller.                                       */
                     93: /* PAF@design.ru: 
                     94:        but there is a ~bug in case (0, "123", 2), it returns "123", and later appends after '3', not '2'.
                     95:        so BEWARE: DO NOT USE IT THAT WAY
                     96:        and changing 'leny' convention: now, if it's 0, then function does leny=strlen(y)
                     97: */
                     98: CORD CORD_cat_char_star(CORD x, const char * y, size_t leny);
                     99: 
1.6       misha     100: #ifdef CORD_CAT_OPTIMIZATION
                    101: void CORD_concatenation_protect(CORD x);
                    102: CORD CORD_cat_optimized(CORD x, CORD y);
                    103: CORD CORD_cat_char_star_optimized(CORD x, const char * y, size_t leny);
                    104: #endif
                    105: 
1.2       paf       106: /* Compute the length of a cord */
                    107: size_t CORD_len(CORD x);
                    108: 
                    109: /* Cords may be represented by functions defining the ith character */
                    110: typedef char (* CORD_fn)(size_t i, void * client_data);
                    111: 
                    112: /* Turn a functional description into a cord.  */
                    113: CORD CORD_from_fn(CORD_fn fn, void * client_data, size_t len);
                    114: 
1.3       paf       115: /* Turn a functional description into a cord, only conjunction&func.   */
                    116: CORD CORD_from_fn_gen(CORD_fn fn, void * client_data, size_t len);
                    117: 
1.2       paf       118: /* Return the substring (subcord really) of x with length at most n,   */
                    119: /* starting at position i.  (The initial character has position 0.)    */
                    120: CORD CORD_substr(CORD x, size_t i, size_t n);
1.7     ! misha     121: CORD CORD_substr_checked(CORD x, size_t i, size_t n);
1.2       paf       122: 
                    123: /* Return the argument, but rebalanced to allow more efficient         */
                    124: /* character retrieval, substring operations, and comparisons.         */
                    125: /* This is useful only for cords that were built using repeated        */
                    126: /* concatenation.  Guarantees log time access to the result, unless    */
                    127: /* x was obtained through a large number of repeated substring ops     */
                    128: /* or the embedded functional descriptions take longer to evaluate.    */
                    129: /* May reallocate significant parts of the cord.  The argument is not  */
                    130: /* modified; only the result is balanced.                              */
                    131: CORD CORD_balance(CORD x);
                    132: 
                    133: /* The following traverse a cord by applying a function to each        */
                    134: /* character.  This is occasionally appropriate, especially where      */
                    135: /* speed is crucial.  But, since C doesn't have nested functions,      */
                    136: /* clients of this sort of traversal are clumsy to write.  Consider    */
                    137: /* the functions that operate on cord positions instead.               */
                    138: 
                    139: /* Function to iteratively apply to individual characters in cord.     */
                    140: typedef int (* CORD_iter_fn)(char c, void * client_data);
                    141: 
1.3       paf       142: /* Function to iteratively apply to individual block in cord.  */
                    143: typedef int (* CORD_block_iter_fn)(char c, size_t len, void* client_data);
                    144: 
                    145: 
1.2       paf       146: /* Function to apply to substrings of a cord.  Each substring is a     */
                    147: /* C character string, not a general cord.                             */
                    148: typedef int (* CORD_batched_iter_fn)(const char * s, void * client_data);
                    149: # define CORD_NO_FN ((CORD_batched_iter_fn)0)
                    150: 
                    151: /* Apply f1 to each character in the cord, in ascending order,         */
                    152: /* starting at position i. If                                          */
                    153: /* f2 is not CORD_NO_FN, then multiple calls to f1 may be replaced by  */
                    154: /* a single call to f2.  The parameter f2 is provided only to allow    */
                    155: /* some optimization by the client.  This terminates when the right    */
                    156: /* end of this string is reached, or when f1 or f2 return != 0.  In the        */
                    157: /* latter case CORD_iter returns != 0.  Otherwise it returns 0.                */
                    158: /* The specified value of i must be < CORD_len(x).                     */
                    159: int CORD_iter5(CORD x, size_t i, CORD_iter_fn f1,
                    160:               CORD_batched_iter_fn f2, void * client_data);
                    161: 
1.3       paf       162: /* iterate over function block in cord */
                    163: int CORD_block_iter(CORD x, size_t i, CORD_block_iter_fn f1, void * client_data);
                    164: 
1.2       paf       165: /* A simpler version that starts at 0, and without f2: */
                    166: int CORD_iter(CORD x, CORD_iter_fn f1, void * client_data);
                    167: # define CORD_iter(x, f1, cd) CORD_iter5(x, 0, f1, CORD_NO_FN, cd)
                    168: 
                    169: /* Similar to CORD_iter5, but end-to-beginning.        No provisions for       */
                    170: /* CORD_batched_iter_fn.                                               */
                    171: int CORD_riter4(CORD x, size_t i, CORD_iter_fn f1, void * client_data);
                    172: 
                    173: /* A simpler version that starts at the end:   */
                    174: int CORD_riter(CORD x, CORD_iter_fn f1, void * client_data);
                    175: 
                    176: /* Functions that operate on cord positions.  The easy way to traverse */
                    177: /* cords.  A cord position is logically a pair consisting of a cord    */
                    178: /* and an index into that cord.  But it is much faster to retrieve a   */
                    179: /* character based on a position than on an index.  Unfortunately,     */
                    180: /* positions are big (order of a few 100 bytes), so allocate them with */
                    181: /* caution.                                                            */
                    182: /* Things in cord_pos.h should be treated as opaque, except as         */
                    183: /* described below.  Also note that                                    */
                    184: /* CORD_pos_fetch, CORD_next and CORD_prev have both macro and function        */
                    185: /* definitions.  The former may evaluate their argument more than once. */
                    186: # include "private/cord_pos.h"
                    187: 
                    188: /*
                    189:        Visible definitions from above:
                    190:        
                    191:        typedef <OPAQUE but fairly big> CORD_pos[1];
                    192:        
                    193:        * Extract the cord from a position:
                    194:        CORD CORD_pos_to_cord(CORD_pos p);
                    195:        
                    196:        * Extract the current index from a position:
                    197:        size_t CORD_pos_to_index(CORD_pos p);
                    198:        
                    199:        * Fetch the character located at the given position:
                    200:        char CORD_pos_fetch(CORD_pos p);
                    201:        
                    202:        * Initialize the position to refer to the given cord and index.
                    203:        * Note that this is the most expensive function on positions:
                    204:        void CORD_set_pos(CORD_pos p, CORD x, size_t i);
                    205:        
                    206:        * Advance the position to the next character.
                    207:        * P must be initialized and valid.
                    208:        * Invalidates p if past end:
                    209:        void CORD_next(CORD_pos p);
                    210:        
                    211:        * Move the position to the preceding character.
                    212:        * P must be initialized and valid.
                    213:        * Invalidates p if past beginning:
                    214:        void CORD_prev(CORD_pos p);
                    215:        
                    216:        * Is the position valid, i.e. inside the cord?
                    217:        int CORD_pos_valid(CORD_pos p);
                    218: */
                    219: # define CORD_FOR(pos, cord) \
                    220:     for (CORD_set_pos(pos, cord, 0); CORD_pos_valid(pos); CORD_next(pos))
                    221: 
                    222:                        
                    223: /* An out of memory handler to call.  May be supplied by client.       */
                    224: /* Must not return.                                                    */
                    225: extern void (* CORD_oom_fn)(void);
                    226: 
                    227: /* Dump the representation of x to stdout in an implementation defined */
                    228: /* manner.  Intended for debugging only.                               */
                    229: void CORD_dump(CORD x);
                    230: 
                    231: /* The following could easily be implemented by the client.  They are  */
                    232: /* provided in cordxtra.c for convenience.                             */
                    233: 
                    234: /* Concatenate a character to the end of a cord.       */
                    235: CORD CORD_cat_char(CORD x, char c);
                    236: /* Concatenate n cords.        */
                    237: CORD CORD_catn(int n, /* CORD */ ...);
                    238: 
                    239: /* Return the character in CORD_substr(x, i, 1)        */
                    240: char CORD_fetch(CORD x, size_t i);
                    241: 
                    242: /* Return < 0, 0, or > 0, depending on whether x < y, x = y, x > y     */
                    243: int CORD_cmp(CORD x, CORD y);
                    244: 
                    245: /* A generalization that takes both starting positions for the                 */
                    246: /* comparison, and a limit on the number of characters to be compared. */
                    247: int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len);
                    248: 
                    249: /* Find the first occurrence of s in x at position start or later.     */
                    250: /* Return the position of the first character of s in x, or            */
                    251: /* CORD_NOT_FOUND if there is none.                                    */
                    252: size_t CORD_str(CORD x, size_t start, CORD s);
                    253: 
                    254: /* Return a cord consisting of i copies of (possibly NUL) c.  Dangerous        */
                    255: /* in conjunction with CORD_to_char_star.                              */
                    256: /* The resulting representation takes constant space, independent of i.        */
                    257: CORD CORD_chars(char c, size_t i);
                    258: # define CORD_nul(i) CORD_chars('\0', (i))
                    259: 
                    260: /* Turn a file into cord.  The file must be seekable.  Its contents    */
                    261: /* must remain constant.  The file may be accessed as an immediate     */
                    262: /* result of this call and/or as a result of subsequent accesses to    */
                    263: /* the cord.  Short files are likely to be immediately read, but       */
                    264: /* long files are likely to be read on demand, possibly relying on     */
                    265: /* stdio for buffering.                                                        */
                    266: /* We must have exclusive access to the descriptor f, i.e. we may      */
                    267: /* read it at any time, and expect the file pointer to be              */
                    268: /* where we left it.  Normally this should be invoked as               */
                    269: /* CORD_from_file(fopen(...))                                          */
                    270: /* CORD_from_file arranges to close the file descriptor when it is no  */
                    271: /* longer needed (e.g. when the result becomes inaccessible).          */ 
                    272: /* The file f must be such that ftell reflects the actual character    */
                    273: /* position in the file, i.e. the number of characters that can be     */
                    274: /* or were read with fread.  On UNIX systems this is always true.  On  */
                    275: /* MS Windows systems, f must be opened in binary mode.                        */
                    276: CORD CORD_from_file(FILE * f);
                    277: 
                    278: /* Equivalent to the above, except that the entire file will be read   */
                    279: /* and the file pointer will be closed immediately.                    */
                    280: /* The binary mode restriction from above does not apply.              */
                    281: CORD CORD_from_file_eager(FILE * f);
                    282: 
                    283: /* Equivalent to the above, except that the file will be read on demand.*/
                    284: /* The binary mode restriction applies.                                        */
                    285: CORD CORD_from_file_lazy(FILE * f);
                    286: 
                    287: /* Turn a cord into a C string.        The result shares no structure with     */
                    288: /* x, and is thus modifiable.                                          */
                    289: char * CORD_to_char_star(CORD x);
                    290: 
                    291: /* Turn a C string into a CORD.  The C string is copied, and so may    */
                    292: /* subsequently be modified.                                           */
                    293: CORD CORD_from_char_star(const char *s);
                    294: 
                    295: /* Identical to the above, but the result may share structure with     */
                    296: /* the argument and is thus not modifiable.                            */
                    297: const char * CORD_to_const_char_star(CORD x); 
                    298: 
                    299: /* Write a cord to a file, starting at the current position.  No       */
                    300: /* trailing NULs or newlines are added.                                */
                    301: /* Returns EOF if a write error occurs, 1 otherwise.                   */
                    302: int CORD_put(CORD x, FILE * f);
                    303: 
                    304: /* "Not found" result for the following two functions.                 */
                    305: # define CORD_NOT_FOUND ((size_t)(-1))
                    306: 
                    307: /* A vague analog of strchr.  Returns the position (an integer, not    */
                    308: /* a pointer) of the first occurrence of (char) c inside x at position         */
                    309: /* i or later. The value i must be < CORD_len(x).                      */
                    310: size_t CORD_chr(CORD x, size_t i, int c);
                    311: 
                    312: /* A vague analog of strrchr.  Returns index of the last occurrence    */
                    313: /* of (char) c inside x at position i or earlier. The value i          */
                    314: /* must be < CORD_len(x).                                              */
                    315: size_t CORD_rchr(CORD x, size_t i, int c);
                    316: 
                    317: 
                    318: /* The following are also not primitive, but are implemented in        */
                    319: /* cordprnt.c.  They provide functionality similar to the ANSI C       */
                    320: /* functions with corresponding names, but with the following          */
                    321: /* additions and changes:                                              */
                    322: /* 1. A %r conversion specification specifies a CORD argument.  Field  */
                    323: /*    width, precision, etc. have the same semantics as for %s.                */
                    324: /*    (Note that %c,%C, and %S were already taken.)                    */
                    325: /* 2. The format string is represented as a CORD.                      */
                    326: /* 3. CORD_sprintf and CORD_vsprintf assign the result through the 1st */      /*    argument. Unlike their ANSI C versions, there is no need to guess */
                    327: /*    the correct buffer size.                                         */
                    328: /* 4. Most of the conversions are implemented through the native               */
                    329: /*    vsprintf.  Hence they are usually no faster, and                         */
                    330: /*    idiosyncrasies of the native printf are preserved.  However,     */
                    331: /*    CORD arguments to CORD_sprintf and CORD_vsprintf are NOT copied; */
                    332: /*    the result shares the original structure.  This may make them    */
                    333: /*    very efficient in some unusual applications.                     */
                    334: /*    The format string is copied.                                     */
                    335: /* All functions return the number of characters generated or -1 on    */
                    336: /* error.  This complies with the ANSI standard, but is inconsistent   */
                    337: /* with some older implementations of sprintf.                         */
                    338: 
                    339: /* The implementation of these is probably less portable than the rest */
                    340: /* of this package.                                                    */
                    341: 
                    342: #ifndef CORD_NO_IO
                    343: 
                    344: #include <stdarg.h>
                    345: 
                    346: int CORD_sprintf(CORD * out, CORD format, ...);
                    347: int CORD_vsprintf(CORD * out, CORD format, va_list args);
                    348: int CORD_fprintf(FILE * f, CORD format, ...);
                    349: int CORD_vfprintf(FILE * f, CORD format, va_list args);
                    350: int CORD_printf(CORD format, ...);
                    351: int CORD_vprintf(CORD format, va_list args);
                    352: 
                    353: #endif /* CORD_NO_IO */
                    354: 
                    355: # endif /* CORD_H */

E-mail: