From 1cd14ad69f9944f9f5315fc07a55776d4087ab1e Mon Sep 17 00:00:00 2001 From: Kasper Date: Wed, 19 Mar 2025 22:04:23 +0200 Subject: [PATCH] finish cleanup --- 3rd/str/.editorconfig | 11 + 3rd/str/.gitignore | 4 + 3rd/str/LICENSE | 30 ++ 3rd/str/Makefile | 51 ++ 3rd/str/README.md | 440 ++++++++++++++++ 3rd/str/snippets.md | 63 +++ 3rd/{ => str}/str.c | 0 3rd/{ => str}/str.h | 0 3rd/str/str_test.c | 907 +++++++++++++++++++++++++++++++++ 3rd/str/tools/file-to-str | 30 ++ 3rd/str/tools/gen_char_class.c | 209 ++++++++ 11 files changed, 1745 insertions(+) create mode 100644 3rd/str/.editorconfig create mode 100644 3rd/str/.gitignore create mode 100644 3rd/str/LICENSE create mode 100644 3rd/str/Makefile create mode 100644 3rd/str/README.md create mode 100644 3rd/str/snippets.md rename 3rd/{ => str}/str.c (100%) rename 3rd/{ => str}/str.h (100%) create mode 100644 3rd/str/str_test.c create mode 100755 3rd/str/tools/file-to-str create mode 100644 3rd/str/tools/gen_char_class.c diff --git a/3rd/str/.editorconfig b/3rd/str/.editorconfig new file mode 100644 index 0000000..76bd3a3 --- /dev/null +++ b/3rd/str/.editorconfig @@ -0,0 +1,11 @@ +root = true + +[*] +indent_style = tab +indent_size = 4 +trim_trailing_whitespace = true +insert_final_newline = true +end_of_line = lf + +[Makefile] +indent_size = 8 diff --git a/3rd/str/.gitignore b/3rd/str/.gitignore new file mode 100644 index 0000000..c740f5e --- /dev/null +++ b/3rd/str/.gitignore @@ -0,0 +1,4 @@ +test +flto-test +*.bak +tools/gen-char-class diff --git a/3rd/str/LICENSE b/3rd/str/LICENSE new file mode 100644 index 0000000..60be582 --- /dev/null +++ b/3rd/str/LICENSE @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2020,2021,2022,2023,2024 Maxim Konakov and contributors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ diff --git a/3rd/str/Makefile b/3rd/str/Makefile new file mode 100644 index 0000000..acc3214 --- /dev/null +++ b/3rd/str/Makefile @@ -0,0 +1,51 @@ +# flags +CC_WARN := -Wall -Wextra -Werror=implicit-function-declaration -Wformat -Werror=format-security + +ifeq ($(CC),musl-gcc) +# musl is ISO 10646 compliant but doesn't define __STDC_ISO_10646__ +CC_EXTRA := -D__STDC_ISO_10646__=201706L +else +# sanitisers only work for non-musl builds +CC_SAN := -fsanitize=address -fsanitize=leak -fsanitize=undefined -fsanitize-address-use-after-scope +endif + +test: CFLAGS := -ggdb -std=c11 -pipe $(CC_WARN) $(CC_EXTRA) -fno-omit-frame-pointer $(CC_SAN) +flto-test: CFLAGS := -s -O2 -pipe -std=c11 $(CC_WARN) $(CC_EXTRA) -flto -march=native -mtune=native +tools: CFLAGS := -s -O2 -pipe -std=c11 $(CC_WARN) $(CC_EXTRA) + +# str library source files +SRC := str.c str.h str_test.c + +# all +.PHONY: all +all: tools test flto-test + +.PHONY: clean +clean: clean-test clean-tools + +# test +test: $(SRC) + $(CC) $(CFLAGS) -o $@ $(filter %.c,$^) + ./$@ + +flto-test: $(SRC) + $(CC) $(CFLAGS) -o $@ $(filter %.c,$^) + ./$@ + +.PHONY: clean-test +clean-test: + rm -f test flto-test + +# tools +GEN_CHAR_CLASS := tools/gen-char-class + +.PHONY: tools +tools: $(GEN_CHAR_CLASS) + +# gen-char-class +$(GEN_CHAR_CLASS): tools/gen_char_class.c + $(CC) $(CFLAGS) -o $@ $(filter %.c,$^) + +.PHONY: clean-tools +clean-tools: + rm -f $(GEN_CHAR_CLASS) diff --git a/3rd/str/README.md b/3rd/str/README.md new file mode 100644 index 0000000..2ea794f --- /dev/null +++ b/3rd/str/README.md @@ -0,0 +1,440 @@ +# str: yet another string library for C language. + +[![License: BSD 3 Clause](https://img.shields.io/badge/License-BSD_3--Clause-yellow.svg)](https://opensource.org/licenses/BSD-3-Clause) + +## Motivation + +Bored with developing the same functionality over and over again, unsatisfied +with existing libraries, so decided to make the right one, once and forever. 
🙂 + +## Features + +* Handles both C and binary strings; +* Light-weight references to strings: cheap to create, copy, or pass by value; +* Support for copy and move semantics, although not enforceable by the C language; +* String composition functions writing to memory, file descriptors, or file streams; +* Can be compiled using `gcc` or `clang`, and linked with `libc` or `musl`. + +## Installation +Just clone the project and copy (or symlink) the files `str.h` and `str.c` into your project, +but please respect the [license](LICENSE). + +## Code Examples + +String composition: + +```C +str s = str_null; + +str_join(&s, str_lit(", "), + str_lit("Here"), + str_lit("there"), + str_lit("and everywhere")); + +str_cat(&s, s, str_lit("...")); + +assert(str_eq(s, str_lit("Here, there, and everywhere..."))); +str_free(s); +``` + +Same as above, but writing to a file: + +```C +FILE* const stream = fopen(...); + +int err = str_join(stream, str_lit(", "), + str_lit("Here"), + str_lit("there"), + str_lit("and everywhere...")); + +if(err != 0) { /* handle the error */ } +``` + +[Discussion](https://news.ycombinator.com/item?id=25212864) on Hacker News. + +## User Guide + +_**Disclaimer:** This is the good old C language, not C++ or Rust, so nothing can be enforced +on the language level, and certain discipline is required to make sure there is no corrupt +or leaked memory resulting from using this library._ + +A string is represented by the type `str` that maintains a pointer to some memory containing the +actual string, and the length of the string. Objects of type `str` are small enough (a struct +of a `const char*` and a `size_t`) to be cheap to create, copy (pass by value), and move. The +`str` structure should be treated as opaque (i.e., do not attempt to directly access or modify +the fields in this structure). 
The strings are assumed to be immutable, like those in Java or +Go, but only by means of `const char*` pointers, so it is actually possible to modify such a +string, although the required type cast to `char*` offers at least some (mostly psychological) +protection from changing the string by mistake. + +This library focusses only on handling strings, not gradually composing them like +[StringBuffer](https://docs.oracle.com/javase/7/docs/api/java/lang/StringBuffer.html) +class in Java. + +All string objects must be initialised before use. Uninitialised objects will cause +undefined behaviour. Use the provided constructors, or `str_null` for empty strings. + +There are two kinds of `str` objects: those actually owning the memory they point to, and +non-owning references. This property can be queried using `str_is_owner` and `str_is_ref` +functions, otherwise such objects are indistinguishable. + +Non-owning string objects are safe to copy and assign to each other, as long as the memory +they refer to is valid. They do not need to be freed. `str_free` is a no-op for reference +objects. A reference object can be cheaply created from a C string, a string literal, +or from a range of bytes. + +Owning objects require special treatment, in particular: +* It is a good idea to have only one owning object per each allocated string, but such +a string can have many references to its underlying string, as long as those references do not +outlive the owning object. +Sometimes this rule may be relaxed for code clarity, like in the above example where +the owning object is passed directly to a function, but only if the function does not +store or release the object. When in doubt pass such an object via `str_ref`. +* Direct assignments (like `s2 = s1;`) to owning objects will certainly leak memory, use +`str_assign` function instead. In fact, this function can assign to any string object, +owning or not, so it can be used everywhere, just to avoid any doubt. 
+* There is no automatic memory management in C, so every owning object must be released at +some point using either `str_free` or `str_clear` function. String objects on the stack +can also be declared as `str_auto` (or `const str_auto`) for automatic cleanup when the variable +goes out of scope. +* An owning object can be moved to another location by using `str_move` function. The +function resets its source object to an empty string. +* Object ownership can be passed over to another object by using `str_pass` function. The +function sets its source to a non-owning reference to the original string. + +It is technically possible to create a reference to a string that is not +null-terminated. The library accepts strings without null-terminators, but every new string +allocated by the library is guaranteed to be null-terminated. + +### String Construction + +A string object can be constructed from any C string, string literal, or a range of bytes. +The provided constructors are computationally cheap to apply. Depending on the constructor, +the new object can either own the actual string it refers to, or be a non-owning reference. +Constructors themselves do not allocate any memory. Importantly, constructors are the only +functions in this library that return a string object, while others only assign their results +through a pointer to a pre-existing string. This makes constructors suitable for initialisation +of new string objects. In all other situations one should combine construction with assignment, +for example:
+`str_assign(&dest, str_acquire_chars(buff, n));` + +### String Object Properties + +Querying a property of a string object (like the length of the string via `str_len`) is a +cheap operation. + +### Assigning, Moving, and Passing String Objects + +C language does not allow for operator overloading, so this library provides a function +`str_assign` that takes a string object and assigns it to the destination object, freeing +any memory owned by the destination. It is generally recommended to use this function +everywhere outside object initialisation. + +An existing object can be moved over to another location via `str_move` function. +The function resets the source object to `str_null` to guarantee the correct move semantics. +The value returned by `str_move` may be either used to initialise a new object, or +assigned to an existing object using `str_assign`. + +An existing object can also be passed over to another location via `str_pass` function. The function +sets the source object to be a non-owning reference to the original string, otherwise the semantics +and usage is the same as `str_move`. + +### String Composition and Generic Destination + +String composition [functions](#string-composition) can write their results to different +destinations, depending on the _type_ of their `dest` parameter: + +* `str*`: result is assigned to the string object; +* `int`: result is written to the file descriptor; +* `FILE*` result is written to the file stream. + +The composition functions return 0 on success, or the value of `errno` as retrieved at the point +of failure (including `ENOMEM` on memory allocation error). + +### Detailed Example + +Just to make things more clear, here is the same code as in the example above, but with comments: +```C +// declare a variable and initialise it with an empty string; could also be declared as "str_auto" +// to avoid explicit call to str_free() below. 
+str s = str_null; + +// join the given string literals around the separator (second parameter), +// storing the result in object "s" (first parameter); in this example we do not check +// the return values of the composition functions, thus ignoring memory allocation failures, +// which is probably not the best idea in general. +str_join(&s, str_lit(", "), + str_lit("Here"), + str_lit("there"), + str_lit("and everywhere")); + +// create a new string concatenating "s" and a literal; the function only modifies its +// destination object "s" after the result is computed, also freeing the destination +// before the assignment, so it is safe to use "s" as both a parameter and a destination. +// note: we pass a copy of the owning object "s" as the second parameter, and here it is +// safe to do so because this particular function does not modify its arguments. +str_cat(&s, s, str_lit("...")); + +// check that we have got the expected result +assert(str_eq(s, str_lit("Here, there, and everywhere..."))); + +// finally, free the memory allocated for the string +str_free(s); +``` + +There are some useful [code snippets](snippets.md) provided to assist with writing code using +this library. + +## API brief + +`typedef struct { ... } str;`
+The string object. + +#### String Properties + +`size_t str_len(const str s)`
+Returns the number of bytes in the string referenced by the object. + +`const char* str_ptr(const str s)`
+Returns a pointer to the first byte of the string referenced by the object. The pointer is never NULL. + +`const char* str_end(const str s)`
+Returns a pointer to the next byte past the end of the string referenced by the object. +The pointer is never NULL, but it is not guaranteed to point to any valid byte or location. +For C strings it points to the terminating null character. For any given string `s` the following +condition is always satisfied: `str_end(s) == str_ptr(s) + str_len(s)`. + +`bool str_is_empty(const str s)`
+Returns "true" for empty strings. + +`bool str_is_owner(const str s)`
+Returns "true" if the string object is the owner of the memory it references. + +`bool str_is_ref(const str s)`
+Returns "true" if the string object does not own the memory it references. + +#### String Construction + +`str_null`
+Empty string constant. + +`str str_lit(s)`
+Constructs a non-owning object from a string literal. Implemented as a macro. + +`str str_ref(s)`
+Constructs a non-owning object from either a null-terminated C string, or another `str` object. +Implemented as a macro. + +`str str_ref_chars(const char* const s, const size_t n)`
+Constructs a non-owning object referencing the given range of bytes. + +`str str_acquire_chars(const char* const s, const size_t n)`
+Constructs an owning object for the specified range of bytes. The pointer `s` should be safe +to pass to `free(3)` function. + +`str str_acquire(const char* const s)`
+Constructs an owning object from the given C string. The string should be safe to pass to +`free(3)` function. + +`str str_move(str* const ps)`
+Saves the given object to a temporary, resets the source object to `str_null`, and then +returns the saved object. + +`str str_pass(str* const ps)`
+Saves the given object to a temporary, sets the source object to be a non-owning reference to the +original string, and then returns the saved object. + +#### String Deallocation + +`void str_free(const str s)`
+Deallocates any memory held by the owning string object. No-op for references. After a call to +this function the string object is in unknown and unusable state. + +String objects on the stack can also be declared as `str_auto` instead of `str` to deallocate +any memory held by the string when the variable goes out of scope. + +#### String Modification + +`void str_assign(str* const ps, const str s)`
+Assigns the object `s` to the object pointed to by `ps`. Any memory owned by the target +object is freed before the assignment. + +`void str_clear(str* const ps)`
+Sets the target object to `str_null` after freeing any memory owned by the target. + +`void str_swap(str* const s1, str* const s2)`
+Swaps two string objects. + +`int str_from_file(str* const dest, const char* const file_name)`
+Reads the entire file (of up to 64MB by default, configurable via `STR_MAX_FILE_SIZE`) into +the destination string. Returns 0 on success, or the value of `errno` on error. + +#### String Comparison + +`int str_cmp(const str s1, const str s2)`
+Lexicographically compares the two string objects, with usual semantics. + +`bool str_eq(const str s1, const str s2)`
+Returns "true" if the two strings match exactly. + +`int str_cmp_ci(const str s1, const str s2)`
+Case-insensitive comparison of two strings, implemented using `strncasecmp(3)`. + +`bool str_eq_ci(const str s1, const str s2)`
+Returns "true" if the two strings match case-insensitively. + +`bool str_has_prefix(const str s, const str prefix)`
+Tests if the given string `s` starts with the specified prefix. + +`bool str_has_suffix(const str s, const str suffix)`
+Tests if the given string `s` ends with the specified suffix. + +#### String Composition + +`int str_cpy(dest, const str src)`
+Copies the source string referenced by `src` to the +[generic](#string-composition-and-generic-destination) destination `dest`. Returns 0 on success, +or the value of `errno` on failure. + +`int str_cat_range(dest, const str* src, size_t count)`
+Concatenates `count` strings from the array starting at address `src`, and writes +the result to the [generic](#string-composition-and-generic-destination) destination `dest`. +Returns 0 on success, or the value of `errno` on failure. + +`int str_cat(dest, ...)`
+Concatenates a variable list of `str` arguments, and writes the result to the +[generic](#string-composition-and-generic-destination) destination `dest`. +Returns 0 on success, or the value of `errno` on failure. + +`int str_join_range(dest, const str sep, const str* src, size_t count)`
+Joins around `sep` the `count` strings from the array starting at address `src`, and writes +the result to the [generic](#string-composition-and-generic-destination) destination `dest`. +Returns 0 on success, or the value of `errno` on failure. + +`int str_join(dest, const str sep, ...)`
+Joins a variable list of `str` arguments around `sep` delimiter, and writes the result to the +[generic](#string-composition-and-generic-destination) destination `dest`. +Returns 0 on success, or the value of `errno` on failure. + +#### Searching and Sorting + +`bool str_partition(const str src, const str patt, str* const prefix, str* const suffix)`
+Splits the string `src` on the first match of `patt`, assigning a reference to the part +of the string before the match to the `prefix` object, and the part after the match to the +`suffix` object. Returns `true` if a match has been found, or `false` otherwise, also +setting `prefix` to reference the entire `src` string, and clearing the `suffix` object. +Empty pattern `patt` never matches. + +`void str_sort_range(const str_cmp_func cmp, str* const array, const size_t count)`
+Sorts the given array of `str` objects using the given comparison function. A number +of typically used comparison functions is also provided: +* `str_order_asc` (ascending sort) +* `str_order_desc` (descending sort) +* `str_order_asc_ci` (ascending case-insensitive sort) +* `str_order_desc_ci` (descending case-insensitive sort) + +`const str* str_search_range(const str key, const str* const array, const size_t count)`
+Binary search for the given key. The input array must be sorted using `str_order_asc`. +Returns a pointer to the string matching the key, or NULL. + +`size_t str_partition_range(bool (*pred)(const str), str* const array, const size_t count)`
+Reorders the string objects in the given range in such a way that all elements for which +the predicate `pred` returns "true" precede the elements for which predicate `pred` +returns "false". Returns the number of preceding objects. + +`size_t str_unique_range(str* const array, const size_t count)`
+Reorders the string objects in the given range in such a way that there are two partitions: +one where each object is unique within the input range, and another partition with all the +remaining objects. The unique partition is stored at the beginning of the array, and is +sorted in ascending order, followed by the partition with all remaining objects. +Returns the number of unique objects. + +#### UNICODE support + +`for_each_codepoint(var_name, src_string)`
+A macro that expands to a loop iterating over the given string `src_string` (of type `str`) by UTF-32 +code points. On each iteration the variable `var_name` (of type `char32_t`) is assigned +the value of the next valid UTF-32 code point from the source string. Upon exit from the loop the +variable has one of the following values: +* `CPI_END_OF_STRING`: the iteration has reached the end of source string; +* `CPI_ERR_INCOMPLETE_SEQ`: an incomplete byte sequence has been detected; +* `CPI_ERR_INVALID_ENCODING`: an invalid byte sequence has been detected. + +The source string is expected to be encoded in the _current program locale_, as set by the most +recent call to `setlocale(3)`. + +Usage pattern: +```c +#include <uchar.h> +... +str s = ... +... +char32_t c; // variable to receive UTF-32 values on each iteration + +for_each_codepoint(c, s) +{ + /* process c */ +} + +if(c != CPI_END_OF_STRING) +{ + /* handle error */ +} +``` + +#### Tokeniser + +Tokeniser interface provides functionality similar to `strtok(3)` function. The tokeniser +is fully re-entrant with no hidden state, and its input string is not modified while being +parsed. + +##### Typical usage: +```C +// declare and initialise tokeniser state +str_tok_state state; + +str_tok_init(&state, source_string, delimiter_set); + +// object to receive tokens +str token = str_null; + +// token iterator +while(str_tok(&token, &state)) +{ + /* process "token" */ +} +``` + +##### Tokeniser API + +`void str_tok_init(str_tok_state* const state, const str src, const str delim_set)`
+Initialises tokeniser state with the given source string and delimiter set. The delimiter set +is treated as bytes, _not_ as UNICODE code points encoded in UTF-8. + +`bool str_tok(str* const dest, str_tok_state* const state)`
+Retrieves the next token and stores it in the `dest` object. Returns `true` if the token has +been read, or `false` if the end of input has been reached. Retrieved token is always +a reference to a slice of the source string. + +`void str_tok_delim(str_tok_state* const state, const str delim_set)`
+Changes the delimiter set associated with the given tokeniser state. The delimiter set is +treated as bytes, _not_ as UNICODE code points encoded in UTF-8. + +## Tools + +All the tools are located in `tools/` directory. Currently, there are the following tools: + +* `file-to-str`: The script takes a file (text or binary) and a C variable name, and +writes to `stdout` C source code where the variable (of type `str`) is defined +and initialised with the content of the file. + +* `gen-char-class`: Generates character classification functions that do the same as their +`isw*()` counterparts under the current locale as specified by `LC_ALL` environment variable. +Run `tools/gen-char-class --help` for further details, or `tools/gen-char-class --space` +to see an example of its output. + +## Project Status +The library requires at least a C11 compiler. So far has been tested on Linux Mint versions +from 19.3 to 22.0, with `gcc` versions from 9.5.0 to 13.2.0 (with either `libc` or `musl`), +and `clang` versions up to 18.1.3; it is also reported to work on ALT Linux 9.1 for Elbrus, with +`lcc` version 1.25.09. diff --git a/3rd/str/snippets.md b/3rd/str/snippets.md new file mode 100644 index 0000000..8528890 --- /dev/null +++ b/3rd/str/snippets.md @@ -0,0 +1,63 @@ +### Code Examples + +Here I provide various (hopefully, useful) functions and code examples that are not included into the +main library. Some examples use non-POSIX and/or compiler-specific features that may or may +not be suitable for a particular project. Also, these snippets were tested while being developed, +but they may break in the future as the library evolves. + +##### `void str_sprintf(str* const dest, const char* fmt, ...)` + +Probably the simplest implementation utilising non-POSIX `asprintf(3)` function: +```C +#define _GNU_SOURCE + +#include "str.h" + +#define str_sprintf(dest, fmt, ...) 
\ +({ \ + char* ___p; \ + const int ___n = asprintf(&___p, (fmt), ##__VA_ARGS__); \ + str_assign((dest), str_acquire_chars(___p, ___n)); \ +}) +``` +This code does not check for errors. A more standard-conforming implementation would probably go +through `open_memstream(3)` function. + +##### `int str_from_int(str* const dest, const int val)` +```C +int str_from_int(str* const dest, const int val) +{ + char buff[256]; // of some "big enough" size + + return str_cpy(dest, str_ref_chars(buff, snprintf(buff, sizeof(buff), "%d", val))); +} +``` + +This code can also be used as a template for other functions converting from `double`, `struct tm`, etc. + +##### `int str_append(str* const dest, ...)` +```C +#define str_append(dest, ...) \ + ({ str* const ___p = (dest); str_cat(___p, *___p, ##__VA_ARGS__); }) +``` +Test case and usage example: +```C + str s = str_lit("zzz"); + + assert(str_append(&s, str_lit(" "), str_lit("aaa")) == 0); + assert(str_eq(s, str_lit("zzz aaa"))); + + str_free(s); +``` + +##### Using `str` objects with `printf` family of functions + +Since a string object is not guaranteed to refer to a null-terminated string it should be formatted +with explicitly specified length, for example: +```C + str s = ... + + printf("%.*s\n", (int)str_len(s), str_ptr(s)); +``` +_Note:_ The maximum length of the string is limited to `INT_MAX` bytes, and formatting will stop +at the first null byte within the string. diff --git a/3rd/str.c b/3rd/str/str.c similarity index 100% rename from 3rd/str.c rename to 3rd/str/str.c diff --git a/3rd/str.h b/3rd/str/str.h similarity index 100% rename from 3rd/str.h rename to 3rd/str/str.h diff --git a/3rd/str/str_test.c b/3rd/str/str_test.c new file mode 100644 index 0000000..f877916 --- /dev/null +++ b/3rd/str/str_test.c @@ -0,0 +1,907 @@ +/* +BSD 3-Clause License + +Copyright (c) 2020,2021,2022,2023,2024 Maxim Konakov and contributors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#define _POSIX_C_SOURCE 200809L + +#include "str.h" + +#include +#include +#include +#include +#include +#include + +// make sure assert is always enabled +#ifdef NDEBUG +#undef NDEBUG +#endif + +#include + +#define passed printf("passed: %s\n", __func__) + +static +void test_str_lit(void) +{ + const str s = str_lit("ZZZ"); + + assert(str_len(s) == 3); + assert(str_is_ref(s)); + assert(!str_is_owner(s)); + assert(str_eq(s, str_lit("ZZZ"))); + + passed; +} + +static +void test_str_cpy(void) +{ + str_auto s = str_null; + + assert(str_cpy(&s, str_lit("ZZZ")) == 0); + + assert(str_len(s) == 3); + assert(!str_is_ref(s)); + assert(str_is_owner(s)); + assert(str_eq(s, str_lit("ZZZ"))); + assert(*str_end(s) == 0); + + passed; +} + +static +void test_str_clear(void) +{ + str s = str_null; + + assert(str_cpy(&s, str_lit("ZZZ")) == 0); + + assert(str_len(s) == 3); + assert(str_is_owner(s)); + assert(*str_end(s) == 0); + + str_clear(&s); + + assert(str_is_empty(s)); + assert(str_is_ref(s)); + + passed; +} + +static +void test_str_move(void) +{ + str s1 = str_null; + + assert(str_cpy(&s1, str_lit("ZZZ")) == 0); + + str s2 = str_move(&s1); + + assert(str_is_empty(s1)); + assert(str_is_ref(s1)); + + assert(str_is_owner(s2)); + assert(str_eq(s2, str_lit("ZZZ"))); + + str_free(s2); + passed; +} + +static +void test_str_pass(void) +{ + str s1 = str_null; + + assert(str_cpy(&s1, str_lit("ZZZ")) == 0); + + str s2 = str_pass(&s1); + + assert(str_is_ref(s1)); + assert(str_eq(s1, str_lit("ZZZ"))); + + assert(str_is_owner(s2)); + assert(str_eq(s2, str_lit("ZZZ"))); + + str_free(s2); + passed; +} + +static +void test_str_ref(void) +{ + str s = str_ref("ZZZ"); + + assert(str_len(s) == 3); + assert(str_is_ref(s)); + + s = str_ref(s); + + assert(str_is_ref(s)); + assert(str_eq(s, str_lit("ZZZ"))); + + const char* const p = "ZZZ"; + + s = str_ref(p); + + assert(str_is_ref(s)); + assert(str_eq(s, str_lit("ZZZ"))); + + passed; +} + +static +void test_str_cmp(void) +{ + const str_auto s = 
str_lit("zzz"); + + assert(str_cmp(s, s) == 0); + assert(str_cmp(s, str_lit("zzz")) == 0); + assert(str_cmp(s, str_lit("zz")) > 0); + assert(str_cmp(s, str_lit("zzzz")) < 0); + assert(str_cmp(s, str_null) > 0); + assert(str_cmp(str_null, s) < 0); + assert(str_cmp(str_null, str_null) == 0); + assert(str_eq(s, str_lit("zzz"))); + + passed; +} + +static +void test_str_cmp_ci(void) +{ + const str s = str_lit("zzz"); + + assert(str_cmp_ci(s, s) == 0); + assert(str_cmp_ci(s, str_lit("zzz")) == 0); + assert(str_cmp_ci(s, str_lit("zz")) > 0); + assert(str_cmp_ci(s, str_lit("zzzz")) < 0); + assert(str_cmp_ci(s, str_null) > 0); + assert(str_cmp_ci(str_null, s) < 0); + assert(str_cmp_ci(str_null, str_null) == 0); + assert(str_cmp_ci(s, str_lit("ZZZ")) == 0); + assert(str_cmp_ci(s, str_lit("ZZ")) > 0); + assert(str_cmp_ci(s, str_lit("ZZZZ")) < 0); + assert(str_eq_ci(s, str_lit("ZZZ"))); + + passed; +} + +static +void test_str_acquire(void) +{ + str_auto s = str_acquire(strdup("ZZZ")); + + assert(str_is_owner(s)); + assert(str_eq(s, str_lit("ZZZ"))); + assert(*str_end(s) == 0); + + passed; +} + +static +void test_str_cat(void) +{ + str s = str_null; + + assert(str_cat(&s, str_lit("AAA"), str_lit("BBB"), str_lit("CCC")) == 0); + + assert(str_eq(s, str_lit("AAABBBCCC"))); + assert(str_is_owner(s)); + assert(*str_end(s) == 0); + + assert(str_cat(&s, str_null, str_null, str_null) == 0); // this simply clears the target string + + assert(str_is_empty(s)); + assert(str_is_ref(s)); + + passed; +} + +static +void test_str_join(void) +{ + str s = str_null; + + assert(str_join(&s, str_lit("_"), str_lit("AAA"), str_lit("BBB"), str_lit("CCC")) == 0); + + assert(str_eq(s, str_lit("AAA_BBB_CCC"))); + assert(str_is_owner(s)); + assert(*str_end(s) == 0); + + assert(str_join(&s, str_lit("_"), str_null, str_lit("BBB"), str_lit("CCC")) == 0); + + assert(str_eq(s, str_lit("_BBB_CCC"))); + assert(str_is_owner(s)); + assert(*str_end(s) == 0); + + assert(str_join(&s, str_lit("_"), str_lit("AAA"), 
str_null, str_lit("CCC")) == 0); + + assert(str_eq(s, str_lit("AAA__CCC"))); + assert(str_is_owner(s)); + assert(*str_end(s) == 0); + + assert(str_join(&s, str_lit("_"), str_lit("AAA"), str_lit("BBB"), str_null) == 0); + + assert(str_eq(s, str_lit("AAA_BBB_"))); + assert(str_is_owner(s)); + assert(*str_end(s) == 0); + + assert(str_join(&s, str_lit("_"), str_null, str_null, str_null) == 0); + + assert(str_eq(s, str_lit("__"))); + assert(str_is_owner(s)); + assert(*str_end(s) == 0); + + assert(str_join(&s, str_null) == 0); // this simply clears the target string + + assert(str_is_empty(s)); + assert(str_is_ref(s)); + + passed; +} + +static +void test_composition(void) +{ + str_auto s = str_lit(", "); + + assert(str_join(&s, s, str_lit("Here"), str_lit("there"), str_lit("and everywhere")) == 0); + assert(str_cat(&s, s, str_lit("...")) == 0); + + assert(str_eq(s, str_lit("Here, there, and everywhere..."))); + assert(str_is_owner(s)); + assert(*str_end(s) == 0); + + passed; +} + +static +void test_sort(void) +{ + str src[] = { str_lit("z"), str_lit("zzz"), str_lit("aaa"), str_lit("bbb") }; + + str_sort_range(str_order_asc, src, sizeof(src)/sizeof(src[0])); + + assert(str_eq(src[0], str_lit("aaa"))); + assert(str_eq(src[1], str_lit("bbb"))); + assert(str_eq(src[2], str_lit("z"))); + assert(str_eq(src[3], str_lit("zzz"))); + + str_sort_range(str_order_desc, src, sizeof(src)/sizeof(src[0])); + + assert(str_eq(src[0], str_lit("zzz"))); + assert(str_eq(src[1], str_lit("z"))); + assert(str_eq(src[2], str_lit("bbb"))); + assert(str_eq(src[3], str_lit("aaa"))); + + passed; +} + +static +void test_sort_ci(void) +{ + str src[] = { str_lit("ZZZ"), str_lit("zzz"), str_lit("aaa"), str_lit("AAA") }; + + str_sort_range(str_order_asc_ci, src, sizeof(src)/sizeof(src[0])); + + assert(str_eq_ci(src[0], str_lit("aaa"))); + assert(str_eq_ci(src[1], str_lit("aaa"))); + assert(str_eq_ci(src[2], str_lit("zzz"))); + assert(str_eq_ci(src[3], str_lit("zzz"))); + + 
str_sort_range(str_order_desc_ci, src, sizeof(src)/sizeof(src[0])); + + assert(str_eq_ci(src[0], str_lit("zzz"))); + assert(str_eq_ci(src[1], str_lit("zzz"))); + assert(str_eq_ci(src[2], str_lit("aaa"))); + assert(str_eq_ci(src[3], str_lit("aaa"))); + + passed; +} + +static +void test_search(void) +{ + str src[] = { str_lit("z"), str_lit("zzz"), str_lit("aaa"), str_lit("bbb") }; + const size_t count = sizeof(src)/sizeof(src[0]); + + str_sort_range(str_order_asc, src, count); + + assert(str_search_range(src[0], src, count) == &src[0]); + assert(str_search_range(src[1], src, count) == &src[1]); + assert(str_search_range(src[2], src, count) == &src[2]); + assert(str_search_range(src[3], src, count) == &src[3]); + assert(str_search_range(str_lit("xxx"), src, count) == NULL); + + passed; +} + +static +void test_prefix(void) +{ + const str s = str_lit("abcd"); + + assert(str_has_prefix(s, str_null)); + assert(str_has_prefix(s, str_lit("a"))); + assert(str_has_prefix(s, str_lit("ab"))); + assert(str_has_prefix(s, str_lit("abc"))); + assert(str_has_prefix(s, str_lit("abcd"))); + + assert(!str_has_prefix(s, str_lit("zzz"))); + assert(!str_has_prefix(s, str_lit("abcde"))); + + passed; +} + +static +void test_suffix(void) +{ + const str s = str_lit("abcd"); + + assert(str_has_suffix(s, str_null)); + assert(str_has_suffix(s, str_lit("d"))); + assert(str_has_suffix(s, str_lit("cd"))); + assert(str_has_suffix(s, str_lit("bcd"))); + assert(str_has_suffix(s, str_lit("abcd"))); + + assert(!str_has_suffix(s, str_lit("zzz"))); + assert(!str_has_suffix(s, str_lit("_abcd"))); + + passed; +} + +static +void test_cpy_to_fd(void) +{ + FILE* const tmp = tmpfile(); + + assert(tmp != NULL); + assert(str_cpy(fileno(tmp), str_lit("ZZZ")) == 0); + + rewind(tmp); + + char buff[32]; + + assert(fread(buff, 1, sizeof(buff), tmp) == 3); + assert(memcmp(buff, "ZZZ", 3) == 0); + + fclose(tmp); + passed; +} + +static +void test_cpy_to_stream(void) +{ + FILE* const tmp = tmpfile(); + + assert(tmp != 
NULL); + assert(str_cpy(tmp, str_lit("ZZZ")) == 0); + + assert(fflush(tmp) == 0); + rewind(tmp); + + char buff[32]; + + assert(fread(buff, 1, sizeof(buff), tmp) == 3); + assert(memcmp(buff, "ZZZ", 3) == 0); + + fclose(tmp); + passed; +} + +static +void test_cat_range_to_fd(void) +{ + const str src[] = { + str_lit("aaa"), + str_lit("bbb"), + str_null, + str_lit("ccc"), + str_lit("ddd"), + str_null, + str_null + }; + + const size_t num_items = sizeof(src)/sizeof(src[0]); + + FILE* const tmp = tmpfile(); + + assert(tmp != NULL); + assert(str_cat_range(fileno(tmp), src, num_items) == 0); + + rewind(tmp); + + const char res[] = "aaabbbcccddd"; + const size_t len = sizeof(res) - 1; + char buff[32]; + + assert(fread(buff, 1, sizeof(buff), tmp) == len); + assert(memcmp(buff, res, len) == 0); + + fclose(tmp); + passed; +} + +static +void test_cat_large_range_to_fd(void) +{ + // prepare data + const size_t n = 100000; + str* const src = calloc(n, sizeof(str)); + + assert(src != NULL); + + char buff[100]; + + for(unsigned i = 0; i < n; i++) + assert(str_cpy(&src[i], str_ref_chars(buff, sprintf(buff, "%u\n", i))) == 0); + + // write to file + FILE* const tmp = tmpfile(); + + assert(tmp != NULL); + assert(str_cat_range(fileno(tmp), src, n) == 0); + + // clear input data + for(unsigned i = 0; i < n; ++i) + str_free(src[i]); + + free(src); + + // validate + rewind(tmp); + + char* line = NULL; + size_t cap = 0; + ssize_t len; + int i = 0; + + while((len = getline(&line, &cap, tmp)) >= 0) + assert(atoi(line) == i++); + + assert(i == (int)n); + + // all done + fclose(tmp); + free(line); + passed; +} + +static +void test_cat_range_to_stream(void) +{ + const str src[] = { + str_lit("aaa"), + str_lit("bbb"), + str_null, + str_lit("ccc"), + str_lit("ddd"), + str_null, + str_null + }; + + const size_t num_items = sizeof(src)/sizeof(src[0]); + + FILE* const tmp = tmpfile(); + + assert(tmp != NULL); + assert(str_cat_range(tmp, src, num_items) == 0); + + assert(fflush(tmp) == 0); + 
rewind(tmp); + + const char res[] = "aaabbbcccddd"; + const size_t len = sizeof(res) - 1; + char buff[32]; + + assert(fread(buff, 1, sizeof(buff), tmp) == len); + assert(memcmp(buff, res, len) == 0); + + fclose(tmp); + passed; +} + +static +void test_join_to_fd(void) +{ + FILE* const tmp = tmpfile(); + + assert(tmp != NULL); + assert(str_join(fileno(tmp), str_lit("_"), str_lit("aaa"), str_lit("bbb"), str_lit("ccc")) == 0); + + rewind(tmp); + + const char res[] = "aaa_bbb_ccc"; + const size_t len = sizeof(res) - 1; + char buff[32]; + + assert(fread(buff, 1, sizeof(buff), tmp) == len); + assert(memcmp(buff, res, len) == 0); + + fclose(tmp); + passed; +} + +static +void test_join_large_range_to_fd(void) +{ + // prepare data + const size_t n = 100000; + str* const src = calloc(n, sizeof(str)); + + assert(src != NULL); + + char buff[100]; + + for(unsigned i = 0; i < n; i++) + assert(str_cpy(&src[i], str_ref_chars(buff, sprintf(buff, "%u", i))) == 0); + + // write to file + FILE* const tmp = tmpfile(); + + assert(tmp != NULL); + assert(str_join_range(fileno(tmp), str_lit("\n"), src, n) == 0); + + // clear input data + for(unsigned i = 0; i < n; ++i) + str_free(src[i]); + + free(src); + + // validate + rewind(tmp); + + char* line = NULL; + size_t cap = 0; + ssize_t len; + int i = 0; + + while((len = getline(&line, &cap, tmp)) >= 0) + assert(atoi(line) == i++); + + assert(i == (int)n); + + // all done + fclose(tmp); + free(line); + passed; +} + +static +void test_join_to_stream(void) +{ + FILE* const tmp = tmpfile(); + + assert(tmp != NULL); + assert(str_join(tmp, str_lit("_"), str_lit("aaa"), str_lit("bbb"), str_lit("ccc")) == 0); + + assert(fflush(tmp) == 0); + rewind(tmp); + + const char res[] = "aaa_bbb_ccc"; + const size_t len = sizeof(res) - 1; + char buff[32]; + + assert(fread(buff, 1, sizeof(buff), tmp) == len); + assert(memcmp(buff, res, len) == 0); + + fclose(tmp); + passed; +} + +static +bool part_pred(const str s) { return str_len(s) < 2; } + +static +void 
test_partition_range(void) +{ + str src[] = { str_lit("aaa"), str_lit("a"), str_lit("aaaa"), str_lit("z") }; + + assert(str_partition_range(part_pred, src, 1) == 0); + + assert(str_partition_range(part_pred, src, sizeof(src)/sizeof(src[0])) == 2); + assert(str_eq(src[0], str_lit("a"))); + assert(str_eq(src[1], str_lit("z"))); + assert(str_partition_range(part_pred, src, 1) == 1); + + src[0] = str_lit("?"); + src[2] = str_lit("*"); + + assert(str_partition_range(part_pred, src, sizeof(src)/sizeof(src[0])) == 3); + assert(str_eq(src[0], str_lit("?"))); + assert(str_eq(src[1], str_lit("z"))); + assert(str_eq(src[2], str_lit("*"))); + assert(str_eq(src[3], str_lit("aaa"))); + + assert(str_partition_range(part_pred, NULL, 42) == 0); + assert(str_partition_range(part_pred, src, 0) == 0); + + passed; +} + +static +void test_unique_range(void) +{ + str src[] = { + str_lit("zzz"), + str_lit("aaa"), + str_lit("zzz"), + str_lit("bbb"), + str_lit("aaa"), + str_lit("ccc"), + str_lit("ccc"), + str_lit("aaa"), + str_lit("ccc"), + str_lit("zzz") + }; + + assert(str_unique_range(src, sizeof(src)/sizeof(src[0])) == 4); + assert(str_eq(src[0], str_lit("aaa"))); + assert(str_eq(src[1], str_lit("bbb"))); + assert(str_eq(src[2], str_lit("ccc"))); + assert(str_eq(src[3], str_lit("zzz"))); + + passed; +} + +static +void test_from_file(void) +{ + str_auto fname = str_null; + + assert(str_cat(&fname, str_lit("tmp_"), str_ref_chars(__func__, sizeof(__func__) - 1)) == 0); + + FILE* const stream = fopen(str_ptr(fname), "w"); + + assert(stream); + assert(str_join(stream, str_lit(" "), str_lit("aaa"), str_lit("bbb"), str_lit("ccc")) == 0); + assert(fclose(stream) == 0); + + str_auto res = str_null; + + assert(str_from_file(&res, str_ptr(fname)) == 0); + unlink(str_ptr(fname)); + assert(str_eq(res, str_lit("aaa bbb ccc"))); + assert(str_is_owner(res)); + + // test errors + assert(str_from_file(&res, ".") == EISDIR); + assert(str_from_file(&res, "/dev/null") == EOPNOTSUPP); + 
assert(str_from_file(&res, "does-not-exist") == ENOENT); + + passed; +} + +#ifdef __STDC_UTF_32__ + +static +void test_codepoint_iterator(void) +{ + const str src = str_lit(u8"ะถั‘ะปั‚ั‹ะน"); // means "yellow" in Russian + static const char32_t src32[] = { U'ะถ', U'ั‘', U'ะป', U'ั‚', U'ั‹', U'ะน' }; + size_t i = 0; + char32_t c; + + for_each_codepoint(c, src) + { + assert(i < sizeof(src32)/sizeof(src32[0])); + assert(c == src32[i++]); + } + + assert(c == CPI_END_OF_STRING); + assert(i == sizeof(src32)/sizeof(src32[0])); + + // empty string iteration + c = 0; + + for_each_codepoint(c, str_null) + assert(0); + + assert(c == CPI_END_OF_STRING); + passed; +} + +#endif // ifdef __STDC_UTF_32__ + +static +void test_tok(void) +{ + typedef struct + { + const str src, delim; + const unsigned n_tok; + const str tok[3]; + } test_data; + + static const test_data t[] = + { + { + str_lit("a,b,c"), + str_lit(","), + 3, + { str_lit("a"), str_lit("b"), str_lit("c") } + }, + { + str_lit(",,a,b,,c,"), + str_lit(","), + 3, + { str_lit("a"), str_lit("b"), str_lit("c") } + }, + { + str_lit("aaa;=~bbb~,=ccc="), + str_lit(",;=~"), + 3, + { str_lit("aaa"), str_lit("bbb"), str_lit("ccc") } + }, + { + str_lit(""), + str_lit(","), + 0, + { } + }, + { + str_lit(""), + str_lit(""), + 0, + { } + }, + { + str_lit(",.;,.;;.,;.,"), + str_lit(",.;"), + 0, + { } + }, + { + str_lit("aaa,bbb,ccc"), + str_lit(""), + 1, + { str_lit("aaa,bbb,ccc") } + }, + { + str_lit("aaa,bbb,ccc"), + str_lit(";-="), + 1, + { str_lit("aaa,bbb,ccc") } + } + }; + + for(unsigned i = 0; i < sizeof(t)/sizeof(t[0]); ++i) + { + unsigned tok_count = 0; + + str tok = str_null; + str_tok_state state; + + str_tok_init(&state, t[i].src, t[i].delim); + + while(str_tok(&tok, &state)) + { +// printf("%u-%u: \"%.*s\" %zu\n", +// i, tok_count, (int)str_len(tok), str_ptr(tok), str_len(tok)); +// fflush(stdout); + + assert(tok_count < t[i].n_tok); + assert(str_eq(tok, t[i].tok[tok_count])); + + ++tok_count; + } + + assert(tok_count == 
t[i].n_tok); + } + + passed; +} + +static +void test_partition(void) +{ + typedef struct + { + const bool res; + const str src, patt, pref, suff; + } test_data; + + static const test_data t[] = + { + { true, str_lit("...abc..."), str_lit("abc"), str_lit("..."), str_lit("...") }, + { true, str_lit("......abc"), str_lit("abc"), str_lit("......"), str_null }, + { true, str_lit("abc......"), str_lit("abc"), str_null, str_lit("......") }, + + { true, str_lit("...a..."), str_lit("a"), str_lit("..."), str_lit("...") }, + { true, str_lit("......a"), str_lit("a"), str_lit("......"), str_null }, + { true, str_lit("a......"), str_lit("a"), str_null, str_lit("......") }, + + { false, str_lit("zzz"), str_null, str_lit("zzz"), str_null }, + { false, str_null, str_lit("zzz"), str_null, str_null }, + { false, str_null, str_null, str_null, str_null }, + + { false, str_lit("...zzz..."), str_lit("xxx"), str_lit("...zzz..."), str_null }, + { false, str_lit("...xxz..."), str_lit("xxx"), str_lit("...xxz..."), str_null }, + { true, str_lit("...xxz...xxx."), str_lit("xxx"), str_lit("...xxz..."), str_lit(".") }, + { true, str_lit(u8"...ั†ะธั„ั€ั‹___"), str_lit(u8"ั†ะธั„ั€ั‹"), str_lit("..."), str_lit("___") } + }; + + for(unsigned i = 0; i < sizeof(t)/sizeof(t[0]); ++i) + { + str pref = str_lit("???"), suff = str_lit("???"); + + assert(str_partition(t[i].src, t[i].patt, &pref, &suff) == t[i].res); + assert(str_eq(pref, t[i].pref)); + assert(str_eq(suff, t[i].suff)); + } + + passed; +} + +int main(void) +{ + // tests + test_str_lit(); + test_str_cpy(); + test_str_clear(); + test_str_move(); + test_str_pass(); + test_str_ref(); + test_str_cmp(); + test_str_cmp_ci(); + test_str_acquire(); + test_str_cat(); + test_str_join(); + test_composition(); + test_sort(); + test_sort_ci(); + test_search(); + test_prefix(); + test_suffix(); + test_cpy_to_fd(); + test_cpy_to_stream(); + test_cat_range_to_fd(); + test_cat_large_range_to_fd(); + test_cat_range_to_stream(); + test_join_to_fd(); + 
test_join_large_range_to_fd(); + test_join_to_stream(); + test_partition_range(); + test_unique_range(); + test_from_file(); + test_tok(); + test_partition(); + +#ifdef __STDC_UTF_32__ + assert(setlocale(LC_ALL, "C.UTF-8")); + + test_codepoint_iterator(); +#endif + + return puts("OK.") < 0; +} diff --git a/3rd/str/tools/file-to-str b/3rd/str/tools/file-to-str new file mode 100755 index 0000000..82366af --- /dev/null +++ b/3rd/str/tools/file-to-str @@ -0,0 +1,30 @@ +#!/bin/sh + +die() { + echo >&2 "$@" + exit 1 +} + +[ $# -eq 2 ] || die "Usage: $(basename "$0") FILE VAR-NAME" +[ -f "$1" ] || die "$0: file \"$1\" does not exist, or is not a file." + +set -e + +cat << EOF +// AUTOMATICALLY GENERATED FILE - DO NOT EDIT + +// source file: $1 + +#include "str.h" + +static +const char _bytes[] = { +EOF + +od -v -w12 -A n -t x1 "$1" | sed -E 's/\<([[:xdigit:]]{2})\>/0x\1,/g' + +cat << EOF + 0x00 }; + +const str $2 = (const str){ _bytes, _ref_info(sizeof(_bytes) - 1) }; +EOF diff --git a/3rd/str/tools/gen_char_class.c b/3rd/str/tools/gen_char_class.c new file mode 100644 index 0000000..9c829d1 --- /dev/null +++ b/3rd/str/tools/gen_char_class.c @@ -0,0 +1,209 @@ +/* +BSD 3-Clause License + +Copyright (c) 2020,2021,2022,2023,2024 Maxim Konakov and contributors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include +#include + +// platform checks +#ifndef __STDC_ISO_10646__ +#error "this platform does not support UNICODE (__STDC_ISO_10646__ is not defined)" +#endif + +#if __SIZEOF_WCHAR_T__ < 4 || __SIZEOF_WINT_T__ < 4 +#error "this platform does not have a usable wchar_t (both sizeof(wchar_t) and sizeof(wint_t) should be at least 4)" +#endif + +// i/o helpers +static __attribute((noinline, noreturn)) +void die(const char* const msg) +{ + perror(msg); + exit(1); +} + +#define do_printf(fmt, ...) \ + do { \ + if(printf(fmt, ##__VA_ARGS__) < 0) \ + die("error writing output"); \ + } while(0) + +#define do_write(str) \ + do { \ + if(fwrite((str), 1, sizeof(str) - 1, stdout) != sizeof(str) - 1) \ + die("error writing output"); \ + } while(0) + +// char type selector (isw*() functions) +typedef int (*selector)(wint_t wc); + +// option parser +static __attribute__((noreturn)) +void usage_exit(void) +{ + static const char usage[] = + "Usage: gen-char-class SELECTOR\n" + " Generate a character classification C function that does the same as its\n" + " isw*() counterpart under the current locale as specified by LC_ALL\n" + " environment variable. 
SELECTOR specifies the classification function\n" + " to generate, it must be any one of:\n" + " --alnum -> use iswalnum()\n" + " --alpha -> use iswalpha()\n" + " --blank -> use iswblank()\n" + " --cntrl -> use iswcntrl()\n" + " --digit -> use iswdigit()\n" + " --graph -> use iswgraph()\n" + " --lower -> use iswlower()\n" + " --print -> use iswprint()\n" + " --punct -> use iswpunct()\n" + " --space -> use iswspace()\n" + " --upper -> use iswupper()\n" + " --xdigit -> use iswxdigit()\n"; + + fputs(usage, stderr); + exit(1); +} + +static +selector fn; + +static +const char* fn_name; + +static +const char* loc; + +#define ARG(name) \ + if(strcmp(argv[1], "--" #name) == 0) { \ + fn = isw ## name; fn_name = #name; \ + return; \ + } + +static +void read_opts(int argc, char* const argv[]) +{ + if(argc != 2) + usage_exit(); + + ARG(alnum) + ARG(alpha) + ARG(blank) + ARG(cntrl) + ARG(digit) + ARG(graph) + ARG(lower) + ARG(print) + ARG(punct) + ARG(space) + ARG(upper) + ARG(xdigit) + + if(strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) + usage_exit(); + + fprintf(stderr, "unknown option: \"%s\"\n", argv[1]); + exit(1); +} + +#undef ARG + +// range printing +static +void print_range(const wint_t first, const wint_t last) +{ + if(first == last) + do_printf("\t\tcase 0x%.2X:\n", first); + else + do_printf("\t\tcase 0x%.2X ... 0x%.2X:\n", first, last); +} + +// header/footer +static +const char header[] = + "/* LC_ALL = \"%s\" */\n" + "bool is_%s(const char32_t c)\n" + "{\n" + " switch(c)\n" + " {\n"; + +static +const char footer[] = + " return true;\n" + " default:\n" + " return false;\n" + " }\n" + "}\n"; + +// main +#define UTF32_MAX_CHAR 0x10ffff + +int main(int argc, char* const argv[]) +{ + read_opts(argc, argv); + + loc = getenv("LC_ALL"); + + if(loc && !setlocale(LC_ALL, loc)) + die("cannot change current locale"); + + errno = 0; + do_printf(header, loc ? 
loc : "", fn_name); + + wint_t first = 0; + bool in_range = false; + + for(wint_t c = 0; c <= UTF32_MAX_CHAR; ++c) + { + const bool match = (fn(c) != 0); + + if(in_range && !match) + print_range(first, c - 1); + else if(!in_range && match) + first = c; + + in_range = match; + } + + if(in_range) + print_range(first, UTF32_MAX_CHAR); + + do_write(footer); + + if(fflush(stdout)) + die("error writing output"); + + return 0; +}