Annotation of parser3/src/lib/cord/cordxtra.c, revision 1.13
1.2 paf 1: /*
2: * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
3: *
4: * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
5: * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
6: *
7: * Permission is hereby granted to use or copy this program
8: * for any purpose, provided the above notices are retained on all copies.
9: * Permission to modify the code and to distribute modified code is granted,
10: * provided the above notices are retained, and a notice that the code was
11: * modified is included with the above copyright notice.
12: *
13: * Author: Hans-J. Boehm (boehm@parc.xerox.com)
14: */
15: /*
16: * These are functions on cords that do not need to understand their
17: * implementation. They serve also serve as example client code for
18: * cord_basics.
19: */
20: /* Boehm, December 8, 1995 1:53 pm PST */
1.7 paf 21: #include "pa_config_includes.h"
1.2 paf 22: # include <stdio.h>
23: # include <string.h>
24: # include <stdlib.h>
25: # include <stdarg.h>
26: # include "cord.h"
27: # include "ec.h"
1.9 misha 28:
1.2 paf 29: # define I_HIDE_POINTERS /* So we get access to allocation lock. */
30: /* We use this for lazy file reading, */
31: /* so that we remain independent */
32: /* of the threads primitives. */
33: # include "gc.h"
34:
35: /* For now we assume that pointer reads and writes are atomic, */
36: /* i.e. another thread always sees the state before or after */
37: /* a write. This might be false on a Motorola M68K with */
38: /* pointers that are not 32-bit aligned. But there probably */
39: /* aren't too many threads packages running on those. */
40: # define ATOMIC_WRITE(x,y) (x) = (y)
41: # define ATOMIC_READ(x) (*(x))
42:
43: /* The standard says these are in stdio.h, but they aren't always: */
44: # ifndef SEEK_SET
45: # define SEEK_SET 0
46: # endif
47: # ifndef SEEK_END
48: # define SEEK_END 2
49: # endif
50:
51: # define BUFSZ 2048 /* Size of stack allocated buffers when */
52: /* we want large buffers. */
53:
54: typedef void (* oom_fn)(void);
55:
56: # define OUT_OF_MEMORY { if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \
57: ABORT("Out of memory\n"); }
58: # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); }
59:
60: CORD CORD_cat_char(CORD x, char c)
61: {
62: register char * string;
63:
64: if (c == '\0') return(CORD_cat(x, CORD_nul(1)));
65: string = GC_MALLOC_ATOMIC(2);
66: if (string == 0) OUT_OF_MEMORY;
67: string[0] = c;
68: string[1] = '\0';
69: return(CORD_cat_char_star(x, string, 1));
70: }
71:
72: CORD CORD_catn(int nargs, ...)
73: {
74: register CORD result = CORD_EMPTY;
75: va_list args;
76: register int i;
77:
78: va_start(args, nargs);
79: for (i = 0; i < nargs; i++) {
80: register CORD next = va_arg(args, CORD);
81: result = CORD_cat(result, next);
82: }
83: va_end(args);
84: return(result);
85: }
86:
87: typedef struct {
88: size_t len;
89: size_t count;
90: char * buf;
91: } CORD_fill_data;
92:
93: int CORD_fill_proc(char c, void * client_data)
94: {
95: register CORD_fill_data * d = (CORD_fill_data *)client_data;
96: register size_t count = d -> count;
97:
98: (d -> buf)[count] = c;
99: d -> count = ++count;
100: if (count >= d -> len) {
101: return(1);
102: } else {
103: return(0);
104: }
105: }
106:
107: int CORD_batched_fill_proc(const char* s, void * client_data)
108: {
109: register CORD_fill_data * d = (CORD_fill_data *)client_data;
110: register size_t count = d -> count;
111: register size_t max = d -> len;
112: register char * buf = d -> buf;
113: register const char* t = s;
114:
115: while((buf[count] = *t++) != '\0') {
116: count++;
117: if (count >= max) {
118: d -> count = count;
119: return(1);
120: }
121: }
122: d -> count = count;
123: return(0);
124: }
125:
126: /* Fill buf with len characters starting at i. */
127: /* Assumes len characters are available. */
128: void CORD_fill_buf(CORD x, size_t i, size_t len, char * buf)
129: {
130: CORD_fill_data fd;
131:
132: fd.len = len;
133: fd.buf = buf;
134: fd.count = 0;
135: (void)CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd);
136: }
137:
138: int CORD_cmp(CORD x, CORD y)
139: {
140: CORD_pos xpos;
141: CORD_pos ypos;
142: register size_t avail, yavail;
143:
144: if (y == CORD_EMPTY) return(x != CORD_EMPTY);
145: if (x == CORD_EMPTY) return(-1);
146: if (CORD_IS_STRING(y) && CORD_IS_STRING(x)) return(strcmp(x,y));
147: CORD_set_pos(xpos, x, 0);
148: CORD_set_pos(ypos, y, 0);
149: for(;;) {
150: if (!CORD_pos_valid(xpos)) {
151: if (CORD_pos_valid(ypos)) {
152: return(-1);
153: } else {
154: return(0);
155: }
156: }
157: if (!CORD_pos_valid(ypos)) {
158: return(1);
159: }
160: if ((avail = CORD_pos_chars_left(xpos)) <= 0
161: || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
162: register char xcurrent = CORD_pos_fetch(xpos);
163: register char ycurrent = CORD_pos_fetch(ypos);
164: if (xcurrent != ycurrent) return(xcurrent - ycurrent);
165: CORD_next(xpos);
166: CORD_next(ypos);
167: } else {
168: /* process as many characters as we can */
169: register int result;
170:
171: if (avail > yavail) avail = yavail;
172: result = strncmp(CORD_pos_cur_char_addr(xpos),
173: CORD_pos_cur_char_addr(ypos), avail);
174: if (result != 0) return(result);
175: CORD_pos_advance(xpos, avail);
176: CORD_pos_advance(ypos, avail);
177: }
178: }
179: }
180:
181: int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len)
182: {
183: CORD_pos xpos;
184: CORD_pos ypos;
185: register size_t count;
186: register long avail, yavail;
187:
188: CORD_set_pos(xpos, x, x_start);
189: CORD_set_pos(ypos, y, y_start);
190: for(count = 0; count < len;) {
191: if (!CORD_pos_valid(xpos)) {
192: if (CORD_pos_valid(ypos)) {
193: return(-1);
194: } else {
195: return(0);
196: }
197: }
198: if (!CORD_pos_valid(ypos)) {
199: return(1);
200: }
201: if ((avail = CORD_pos_chars_left(xpos)) <= 0
202: || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
203: register char xcurrent = CORD_pos_fetch(xpos);
204: register char ycurrent = CORD_pos_fetch(ypos);
205: if (xcurrent != ycurrent) return(xcurrent - ycurrent);
206: CORD_next(xpos);
207: CORD_next(ypos);
208: count++;
209: } else {
210: /* process as many characters as we can */
211: register int result;
212:
213: if (avail > yavail) avail = yavail;
214: count += avail;
215: if (count > len) avail -= (count - len);
216: result = strncmp(CORD_pos_cur_char_addr(xpos),
217: CORD_pos_cur_char_addr(ypos), (size_t)avail);
218: if (result != 0) return(result);
219: CORD_pos_advance(xpos, (size_t)avail);
220: CORD_pos_advance(ypos, (size_t)avail);
221: }
222: }
223: return(0);
224: }
225:
1.13 ! misha 226: char * CORD_to_char_star(CORD x, size_t len)
1.2 paf 227: {
1.13 ! misha 228: char * result;
! 229: if(0 == len) len = CORD_len(x);
! 230: result = GC_MALLOC_ATOMIC(len + 1);
1.2 paf 231:
232: if (result == 0) OUT_OF_MEMORY;
233: CORD_fill_buf(x, 0, len, result);
234: result[len] = '\0';
235: return(result);
236: }
237:
238: CORD CORD_from_char_star(const char* s)
239: {
240: char * result;
241: size_t len = strlen(s);
242:
243: if (0 == len) return(CORD_EMPTY);
244: result = GC_MALLOC_ATOMIC(len + 1);
245: if (result == 0) OUT_OF_MEMORY;
246: memcpy(result, s, len+1);
247: return(result);
248: }
249:
1.13 ! misha 250: const char* CORD_to_const_char_star(CORD x, size_t len)
1.2 paf 251: {
252: if (x == 0) return("");
253: if (CORD_IS_STRING(x)) return((const char* )x);
1.13 ! misha 254: return(CORD_to_char_star(x, len));
1.2 paf 255: }
256:
257: char CORD_fetch(CORD x, size_t i)
258: {
259: CORD_pos xpos;
260:
261: CORD_set_pos(xpos, x, i);
262: if (!CORD_pos_valid(xpos)) ABORT("bad index?");
263: return(CORD_pos_fetch(xpos));
264: }
265:
266:
267: int CORD_put_proc(char c, void * client_data)
268: {
269: register FILE * f = (FILE *)client_data;
270:
271: return(putc(c, f) == EOF);
272: }
273:
274: int CORD_batched_put_proc(const char* s, void * client_data)
275: {
276: register FILE * f = (FILE *)client_data;
277:
278: return(fputs(s, f) == EOF);
279: }
280:
281:
282: int CORD_put(CORD x, FILE * f)
283: {
284: if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f)) {
285: return(EOF);
286: } else {
287: return(1);
288: }
289: }
290:
291: typedef struct {
292: size_t pos; /* Current position in the cord */
293: char target; /* Character we're looking for */
294: } chr_data;
295:
296: int CORD_chr_proc(char c, void * client_data)
297: {
298: register chr_data * d = (chr_data *)client_data;
299:
300: if (c == d -> target) return(1);
301: (d -> pos) ++;
302: return(0);
303: }
1.4 paf 304:
1.2 paf 305: int CORD_rchr_proc(char c, void * client_data)
306: {
307: register chr_data * d = (chr_data *)client_data;
308:
309: if (c == d -> target) return(1);
310: (d -> pos) --;
311: return(0);
312: }
313:
314: int CORD_batched_chr_proc(const char* s, void * client_data)
315: {
316: register chr_data * d = (chr_data *)client_data;
317: register char * occ = strchr(s, d -> target);
318:
319: if (occ == 0) {
320: d -> pos += strlen(s);
321: return(0);
322: } else {
323: d -> pos += occ - s;
324: return(1);
325: }
326: }
327:
328: size_t CORD_chr(CORD x, size_t i, int c)
329: {
330: chr_data d;
331:
332: d.pos = i;
333: d.target = c;
334: if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) {
335: return(d.pos);
336: } else {
337: return(CORD_NOT_FOUND);
338: }
339: }
340:
341: size_t CORD_rchr(CORD x, size_t i, int c)
342: {
343: chr_data d;
344:
345: d.pos = i;
346: d.target = c;
347: if (CORD_riter4(x, i, CORD_rchr_proc, &d)) {
348: return(d.pos);
349: } else {
350: return(CORD_NOT_FOUND);
351: }
352: }
353:
354: /* Find the first occurrence of s in x at position start or later. */
355: /* This uses an asymptotically poor algorithm, which should typically */
356: /* perform acceptably. We compare the first few characters directly, */
357: /* and call CORD_ncmp whenever there is a partial match. */
358: /* This has the advantage that we allocate very little, or not at all. */
359: /* It's very fast if there are few close misses. */
1.13 ! misha 360: size_t CORD_str(CORD x, size_t start, CORD s, size_t xlen)
1.2 paf 361: {
362: CORD_pos xpos;
363: size_t slen;
364: register size_t start_len;
365: const char* s_start;
366: unsigned long s_buf = 0; /* The first few characters of s */
367: unsigned long x_buf = 0; /* Start of candidate substring. */
368: /* Initialized only to make compilers */
369: /* happy. */
370: unsigned long mask = 0;
371: register size_t i;
372: register size_t match_pos;
373:
374: if (s == CORD_EMPTY) return(start);
375: if (CORD_IS_STRING(s)) {
376: s_start = s;
377: slen = strlen(s);
378: } else {
1.13 ! misha 379: s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long), 0), 0);
1.2 paf 380: slen = CORD_len(s);
381: }
382: if (xlen < start || xlen - start < slen) return(CORD_NOT_FOUND);
383: start_len = slen;
384: if (start_len > sizeof(unsigned long)) start_len = sizeof(unsigned long);
385: CORD_set_pos(xpos, x, start);
386: for (i = 0; i < start_len; i++) {
387: mask <<= 8;
388: mask |= 0xff;
389: s_buf <<= 8;
1.3 paf 390: s_buf |= (unsigned char)s_start[i];
1.2 paf 391: x_buf <<= 8;
1.5 paf 392: x_buf |= (unsigned char)CORD_pos_fetch(xpos);
1.2 paf 393: CORD_next(xpos);
394: }
395: for (match_pos = start; ; match_pos++) {
396: if ((x_buf & mask) == s_buf) {
397: if (slen == start_len ||
398: CORD_ncmp(x, match_pos + start_len,
399: s, start_len, slen - start_len) == 0) {
400: return(match_pos);
401: }
402: }
403: if ( match_pos == xlen - slen ) {
404: return(CORD_NOT_FOUND);
405: }
406: x_buf <<= 8;
1.5 paf 407: x_buf |= (unsigned char)CORD_pos_fetch(xpos);
1.2 paf 408: CORD_next(xpos);
409: }
410: }
411:
412: void CORD_ec_flush_buf(CORD_ec x)
413: {
414: register size_t len = x[0].ec_bufptr - x[0].ec_buf;
415: char * s;
416:
417: if (len == 0) return;
418: s = GC_MALLOC_ATOMIC(len+1);
419: memcpy(s, x[0].ec_buf, len);
420: s[len] = '\0';
421: x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len);
422: x[0].ec_bufptr = x[0].ec_buf;
423: }
424:
425: void CORD_ec_append_cord(CORD_ec x, CORD s)
426: {
427: CORD_ec_flush_buf(x);
428: x[0].ec_cord = CORD_cat(x[0].ec_cord, s);
429: }
430:
431: /*ARGSUSED*/
432: char CORD_nul_func(size_t i, void * client_data)
433: {
434: return((char)(unsigned long)client_data);
1.4 paf 435: }
436:
1.9 misha 437: #ifdef CORD_CHARS_CACHE
1.11 misha 438: static char* cord_chars_cache[256][15]={0};
1.9 misha 439: #endif
440:
1.2 paf 441: CORD CORD_chars(char c, size_t i)
442: {
1.8 misha 443: if (i>0 && i<16 /* SHORT_LIMIT */) {
1.10 misha 444: register char* result;
1.9 misha 445: #ifdef CORD_CHARS_CACHE
1.11 misha 446: if(cord_chars_cache[(unsigned char)c][i])
447: return((CORD) cord_chars_cache[(unsigned char)c][i]);
1.9 misha 448: #endif
1.8 misha 449: result=GC_MALLOC_ATOMIC(i+1);
450: if(result==0) OUT_OF_MEMORY;
451: memset(result, c, i);
452: result[i] = '\0';
1.9 misha 453: #ifdef CORD_CHARS_CACHE
1.11 misha 454: cord_chars_cache[(unsigned char)c][i]=result;
1.9 misha 455: #endif
1.8 misha 456: return((CORD) result);
457: } else {
458: return(CORD_from_fn(CORD_nul_func, (void *)(unsigned long)c, i));
459: }
1.2 paf 460: }
461:
462: CORD CORD_from_file_eager(FILE * f)
463: {
464: register int c;
465: CORD_ec ecord;
466:
467: CORD_ec_init(ecord);
468: for(;;) {
469: c = getc(f);
470: if (c == 0) {
471: /* Append the right number of NULs */
472: /* Note that any string of NULs is rpresented in 4 words, */
473: /* independent of its length. */
474: register size_t count = 1;
475:
476: CORD_ec_flush_buf(ecord);
477: while ((c = getc(f)) == 0) count++;
478: ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count));
479: }
480: if (c == EOF) break;
481: CORD_ec_append(ecord, c);
482: }
483: (void) fclose(f);
484: return(CORD_balance(CORD_ec_to_cord(ecord)));
485: }
486:
487: /* The state maintained for a lazily read file consists primarily */
488: /* of a large direct-mapped cache of previously read values. */
489: /* We could rely more on stdio buffering. That would have 2 */
490: /* disadvantages: */
491: /* 1) Empirically, not all fseek implementations preserve the */
492: /* buffer whenever they could. */
493: /* 2) It would fail if 2 different sections of a long cord */
494: /* were being read alternately. */
495: /* We do use the stdio buffer for read ahead. */
496: /* To guarantee thread safety in the presence of atomic pointer */
497: /* writes, cache lines are always replaced, and never modified in */
498: /* place. */
499:
500: # define LOG_CACHE_SZ 14
501: # define CACHE_SZ (1 << LOG_CACHE_SZ)
502: # define LOG_LINE_SZ 9
503: # define LINE_SZ (1 << LOG_LINE_SZ)
504:
505: typedef struct {
506: size_t tag;
507: char data[LINE_SZ];
508: /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ */
509: } cache_line;
510:
511: typedef struct {
512: FILE * lf_file;
513: size_t lf_current; /* Current file pointer value */
514: cache_line * volatile lf_cache[CACHE_SZ/LINE_SZ];
515: } lf_state;
516:
517: # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
518: # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
519: # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
520: # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
521: # define LINE_START(n) ((n) & ~(LINE_SZ - 1))
522:
523: typedef struct {
524: lf_state * state;
525: size_t file_pos; /* Position of needed character. */
526: cache_line * new_cache;
527: } refill_data;
528:
529: /* Executed with allocation lock. */
530: static char refill_cache(client_data)
531: refill_data * client_data;
532: {
533: register lf_state * state = client_data -> state;
534: register size_t file_pos = client_data -> file_pos;
535: FILE *f = state -> lf_file;
536: size_t line_start = LINE_START(file_pos);
537: size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos));
538: cache_line * new_cache = client_data -> new_cache;
539:
540: if (line_start != state -> lf_current
541: && fseek(f, line_start, SEEK_SET) != 0) {
542: ABORT("fseek failed");
543: }
544: if (fread(new_cache -> data, sizeof(char), LINE_SZ, f)
545: <= file_pos - line_start) {
546: ABORT("fread failed");
547: }
548: new_cache -> tag = DIV_LINE_SZ(file_pos);
549: /* Store barrier goes here. */
550: ATOMIC_WRITE(state -> lf_cache[line_no], new_cache);
551: state -> lf_current = line_start + LINE_SZ;
552: return(new_cache->data[MOD_LINE_SZ(file_pos)]);
553: }
554:
555: char CORD_lf_func(size_t i, void * client_data)
556: {
557: register lf_state * state = (lf_state *)client_data;
558: register cache_line * volatile * cl_addr =
559: &(state -> lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))]);
560: register cache_line * cl = (cache_line *)ATOMIC_READ(cl_addr);
561:
562: if (cl == 0 || cl -> tag != DIV_LINE_SZ(i)) {
563: /* Cache miss */
564: refill_data rd;
565:
566: rd.state = state;
567: rd.file_pos = i;
568: rd.new_cache = GC_NEW_ATOMIC(cache_line);
569: if (rd.new_cache == 0) OUT_OF_MEMORY;
570: return((char)(GC_word)
571: GC_call_with_alloc_lock((GC_fn_type) refill_cache, &rd));
572: }
573: return(cl -> data[MOD_LINE_SZ(i)]);
574: }
575:
576: /*ARGSUSED*/
577: void CORD_lf_close_proc(void * obj, void * client_data)
578: {
579: if (fclose(((lf_state *)obj) -> lf_file) != 0) {
580: ABORT("CORD_lf_close_proc: fclose failed");
581: }
582: }
583:
584: #ifndef PA_DEBUG_DISABLE_GC
585: CORD CORD_from_file_lazy_inner(FILE * f, size_t len)
586: {
587: register lf_state * state = GC_NEW(lf_state);
588: register int i;
589:
590: if (state == 0) OUT_OF_MEMORY;
591: if (len != 0) {
592: /* Dummy read to force buffer allocation. */
593: /* This greatly increases the probability */
594: /* of avoiding deadlock if buffer allocation */
595: /* is redirected to GC_malloc and the */
596: /* world is multithreaded. */
597: char buf[1];
598:
599: (void) fread(buf, 1, 1, f);
600: rewind(f);
601: }
602: state -> lf_file = f;
603: for (i = 0; i < CACHE_SZ/LINE_SZ; i++) {
604: state -> lf_cache[i] = 0;
605: }
606: state -> lf_current = 0;
607: GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0);
608: return(CORD_from_fn(CORD_lf_func, state, len));
609: }
610:
611: CORD CORD_from_file_lazy(FILE * f)
612: {
613: register long len;
614:
615: if (fseek(f, 0l, SEEK_END) != 0) {
616: ABORT("Bad fd argument - fseek failed");
617: }
618: if ((len = ftell(f)) < 0) {
619: ABORT("Bad fd argument - ftell failed");
620: }
621: rewind(f);
622: return(CORD_from_file_lazy_inner(f, (size_t)len));
623: }
624:
625: # define LAZY_THRESHOLD (128*1024 + 1)
626:
627: CORD CORD_from_file(FILE * f)
628: {
629: register long len;
630:
631: if (fseek(f, 0l, SEEK_END) != 0) {
632: ABORT("Bad fd argument - fseek failed");
633: }
634: if ((len = ftell(f)) < 0) {
635: ABORT("Bad fd argument - ftell failed");
636: }
637: rewind(f);
638: if (len < LAZY_THRESHOLD) {
639: return(CORD_from_file_eager(f));
640: } else {
641: return(CORD_from_file_lazy_inner(f, (size_t)len));
642: }
643: }
644: #endif
E-mail: