commit 3a2138a2defff7181e56b5b18c555b850e6bc55a
parent 62e8c3110adf6391dc19afc8882dcd5264afdbc9
Author: Vincent Forest <vincent.forest@meso-star.com>
Date: Fri, 9 Jan 2026 11:01:27 +0100
Compress line parameters
The lines are compressed to limit memory usage. The memory allocation
policy is also modified to avoid the overhead associated with using a
dynamic array to store all the data.
On loading, lines are first stored in a fixed-size chunk that, once
filled, is compressed with zlib. The compressed data is then stored in a
fixed-size memory block. Each chunk can be accessed and decompressed
separately. The memory descriptors for the compressed chunks, i.e., the
data defining the location where the compressed data is stored in the
memory blocks, are stored in a dynamic array in the order in which the
lines are loaded. Similarly, the memory blocks that contain the
compressed data are also listed in a dynamic array in the order in which
the lines are read. Therefore, the index of a line is sufficient to
index the chunk in which the line is stored, and finally the memory
block in which its parameters are compressed.
Using memory blocks stored in dynamic arrays rather than a simple
dynamic array of contiguous bytes is motivated by the issue of
additional memory overhead associated with the use of dynamic arrays. In
the worst case, the memory overhead here is equal to twice the number of
blocks multiplied by the size of a pointer, compared to twice the size
required to store all the lines.
Finally, a cache is used to speed up access to lines, which now need to
be decompressed. This cache stores the decompressed chunks containing
the most recently accessed lines. As a result, the shtr_line_list_at
function can no longer take a constant list as a parameter, because it
needs to update its internal cache. Note that the cache is actually very
naive and only improves efficiency for linear accesses: it stores only a
single entry, namely the last chunk of lines accessed. Furthermore, the
use of this cache is also very naive, so that two threads could
decompress the same chunk to update the cache. This is in fact an
initial proposal whose sole purpose is to provide a complete functional
implementation of the shtr API, which now compresses lines.
Preliminary tests indicate that memory usage is reduced by a factor of
approximately 2, while loading time is increased by a factor of 2 to 3 due
to compression.
This increase in loading time should not pose a major problem, as once
created, the compressed list of lines should be serialized so that it
can be reused as is in calculations. No tests have been performed on
access efficiency, which should currently be very poor for random access
(see above).
Diffstat:
10 files changed, 641 insertions(+), 66 deletions(-)
diff --git a/Makefile b/Makefile
@@ -33,6 +33,7 @@ all: library tests utils
################################################################################
SRC = \
src/shtr.c \
+ src/shtr_cache.c \
src/shtr_isotope_metadata.c \
src/shtr_line_list.c \
src/shtr_param.c
@@ -65,6 +66,7 @@ libshtr.o: $(OBJ)
.config: config.mk
$(PKG_CONFIG) --atleast-version $(RSYS_VERSION) rsys
+ $(PKG_CONFIG) --atleast-version $(ZLIB_VERSION) zlib
echo "config done" > $@
.SUFFIXES: .c .d .o
@@ -110,6 +112,7 @@ pkg:
sed -e 's#@PREFIX@#$(PREFIX)#g' \
-e 's#@VERSION@#$(VERSION)#g' \
-e 's#@RSYS_VERSION@#$(RSYS_VERSION)#g' \
+ -e 's#@ZLIB_VERSION@#$(ZLIB_VERSION)#g' \
shtr.pc.in > shtr.pc
shtr-local.pc: shtr.pc.in
@@ -118,6 +121,7 @@ shtr-local.pc: shtr.pc.in
-e 's#^libdir=.*#libdir=./#' \
-e 's#@VERSION@#$(VERSION)#g' \
-e 's#@RSYS_VERSION@#$(RSYS_VERSION)#g' \
+ -e 's#@ZLIB_VERSION@#$(ZLIB_VERSION)#g' \
shtr.pc.in > $@
install: library pkg utils
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@ database](https://hitran.org/).
- POSIX make
- pkg-config
- RSys
+- zlib
## Installation
diff --git a/config.mk b/config.mk
@@ -29,9 +29,10 @@ PCFLAGS_STATIC = --static
PCFLAGS = $(PCFLAGS_$(LIB_TYPE))
RSYS_VERSION = 0.14
+ZLIB_VERSION = 1
-INCS = $$($(PKG_CONFIG) $(PCFLAGS) --cflags rsys)
-LIBS = $$($(PKG_CONFIG) $(PCFLAGS) --libs rsys)
+INCS = $$($(PKG_CONFIG) $(PCFLAGS) --cflags rsys zlib)
+LIBS = $$($(PKG_CONFIG) $(PCFLAGS) --libs rsys zlib)
################################################################################
# Compilation options
diff --git a/shtr.pc.in b/shtr.pc.in
@@ -3,6 +3,7 @@ includedir=${prefix}/include
libdir=${prefix}/lib
Requires: rsys >= @RSYS_VERSION@
+Requires.private: zlib >= @ZLIB_VERSION@
Name: Star-STL
Description: Star HITRAN library
Version: @VERSION@
diff --git a/src/shtr.h b/src/shtr.h
@@ -274,7 +274,7 @@ shtr_line_list_get_size
SHTR_API res_T
shtr_line_list_at
- (const struct shtr_line_list* list,
+ (struct shtr_line_list* list,
const size_t i,
struct shtr_line* line);
diff --git a/src/shtr_cache.c b/src/shtr_cache.c
@@ -0,0 +1,143 @@
+/* Copyright (C) 2022, 2025, 2026 |Méso|Star> (contact@meso-star.com)
+ * Copyright (C) 2025, 2026 Université de Lorraine
+ * Copyright (C) 2022 Centre National de la Recherche Scientifique
+ * Copyright (C) 2022 Université Paul Sabatier
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "shtr_c.h"
+#include "shtr_cache.h"
+
+#include <rsys/cstr.h>
+#include <rsys/mutex.h>
+#include <rsys/ref_count.h>
+
+#include <string.h> /* memcpy */
+
+#define CHUNK_ID_NONE SIZE_MAX
+
+/* Simple, dumb cache structure storing uncompressed lines from the last chunk
+ * accessed. It should improve linear access performance, but not random access
+ * performance, which will be disastrous because most accesses will require
+ * decompressing an entire block of lines, only one of which will be accessed
+ * before the block is discarded.
+ *
+ * The structure is reference counted and is allocated with the allocator of
+ * the shtr device it keeps a reference on.
+ *
+ * TODO: implement a more general LRU cache */
+struct cache {
+  size_t chunk_id; /* Index of the cached chunk. CHUNK_ID_NONE if empty */
+  struct line lines[NLINES_PER_CHUNK]; /* Uncompressed lines of the chunk */
+
+  struct mutex* mutex; /* Locked around accesses to chunk_id and lines */
+  struct shtr* shtr; /* Device on which the cache holds a reference */
+  ref_T ref; /* Reference counter of the cache */
+};
+
+/*******************************************************************************
+ * Helper functions
+ ******************************************************************************/
+/* Reference-counter callback invoked when the last reference on the cache is
+ * dropped: destroy the mutex (if it was created), free the cache itself and
+ * finally give back the reference taken on the shtr device */
+static void
+release_cache(ref_T* ref)
+{
+  struct cache* self = NULL;
+  struct shtr* owner = NULL;
+  ASSERT(ref);
+
+  self = CONTAINER_OF(ref, struct cache, ref);
+
+  /* Keep the device at hand: it is still needed once the cache is freed */
+  owner = self->shtr;
+
+  if(self->mutex) mutex_destroy(self->mutex);
+  MEM_RM(owner->allocator, self);
+  SHTR(ref_put(owner));
+}
+
+/*******************************************************************************
+ * Local functions
+ ******************************************************************************/
+/* Create an empty cache, i.e. a cache that stores no chunk yet.
+ *
+ * The cache is allocated with the allocator of `shtr', on which it takes a
+ * reference. On success, *out_cache is the new cache, owned by the caller
+ * through one reference. On error, *out_cache is set to NULL and either
+ * RES_MEM_ERR is returned (allocation or mutex creation failure). */
+res_T
+cache_create(struct shtr* shtr, struct cache** out_cache)
+{
+  struct cache* new_cache = NULL;
+  res_T res = RES_OK;
+
+  ASSERT(shtr && out_cache); /* Pre-conditions */
+
+  new_cache = MEM_CALLOC(shtr->allocator, 1, sizeof(struct cache));
+  if(!new_cache) { res = RES_MEM_ERR; goto error; }
+
+  /* From here on, the cache is released through its reference counter */
+  ref_init(&new_cache->ref);
+  SHTR(ref_get(shtr));
+  new_cache->shtr = shtr;
+  new_cache->chunk_id = CHUNK_ID_NONE; /* No chunk cached */
+
+  new_cache->mutex = mutex_create();
+  if(!new_cache->mutex) { res = RES_MEM_ERR; goto error; }
+
+exit:
+  *out_cache = new_cache;
+  return res;
+error:
+  ERROR(shtr, "Error creating line cache -- %s\n", res_to_cstr(res));
+  if(new_cache) { cache_ref_put(new_cache); new_cache = NULL; }
+  goto exit;
+}
+
+/* Take one more reference on the cache */
+void
+cache_ref_get(struct cache* cache)
+{
+  ASSERT(cache != NULL);
+  ref_get(&cache->ref);
+}
+
+/* Drop one reference on the cache. The cache is released (see release_cache)
+ * when its last reference is dropped */
+void
+cache_ref_put(struct cache* cache)
+{
+  ASSERT(cache != NULL);
+  ref_put(&cache->ref, release_cache);
+}
+
+/* Fetch from the cache the encoded line whose index in the whole list is
+ * `line_id'.
+ *
+ * The index is split into the index of the chunk that stores the line and the
+ * position of the line in that chunk. The lookup succeeds only if this chunk
+ * is the one currently cached.
+ *
+ * Returns RES_OK and fills `line' on a cache hit. Returns RES_BAD_ARG on a
+ * cache miss, in which case `line' is left untouched. */
+res_T
+cache_get_line
+  (struct cache* cache,
+   const size_t line_id,
+   struct line* line)
+{
+  const size_t chunk_id = line_id / NLINES_PER_CHUNK;
+  const size_t chunk_line_id = line_id % NLINES_PER_CHUNK;
+  res_T res = RES_OK;
+
+  ASSERT(cache && line);
+
+  /* `line_id' is a list-wide index and is therefore not bounded by the number
+   * of lines per chunk: only the position in the chunk is */
+  ASSERT(chunk_id != CHUNK_ID_NONE && chunk_line_id < NLINES_PER_CHUNK);
+
+  mutex_lock(cache->mutex);
+  if(cache->chunk_id != chunk_id) {
+    res = RES_BAD_ARG; /* Cache miss */
+  } else {
+    *line = cache->lines[chunk_line_id];
+  }
+  mutex_unlock(cache->mutex);
+
+  return res;
+}
+
+/* Make `lines', the uncompressed content of the chunk `chunk_id', the cached
+ * chunk. The lines are copied. Nothing is done if this chunk is already the
+ * cached one. */
+void
+cache_put_chunk
+  (struct cache* cache,
+   const size_t chunk_id,
+   const struct line lines[NLINES_PER_CHUNK])
+{
+  int is_cached = 0;
+  ASSERT(cache && chunk_id != CHUNK_ID_NONE && lines);
+
+  mutex_lock(cache->mutex);
+  is_cached = (cache->chunk_id == chunk_id);
+  if(!is_cached) {
+    memcpy(cache->lines, lines, sizeof(cache->lines));
+    cache->chunk_id = chunk_id;
+  }
+  mutex_unlock(cache->mutex);
+}
diff --git a/src/shtr_cache.h b/src/shtr_cache.h
@@ -0,0 +1,54 @@
+/* Copyright (C) 2022, 2025, 2026 |Méso|Star> (contact@meso-star.com)
+ * Copyright (C) 2025, 2026 Université de Lorraine
+ * Copyright (C) 2022 Centre National de la Recherche Scientifique
+ * Copyright (C) 2022 Université Paul Sabatier
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef SHTR_CACHE_H
+#define SHTR_CACHE_H
+
+#include "shtr_line_list_c.h"
+
+/* Forward declarations */
+struct shtr;
+struct cache;
+
+extern LOCAL_SYM res_T
+cache_create
+ (struct shtr* shtr,
+ struct cache** cache);
+
+extern LOCAL_SYM void
+cache_ref_get
+ (struct cache* cache);
+
+extern LOCAL_SYM void
+cache_ref_put
+ (struct cache* cache);
+
+/* Returns RES_BAD_ARG if the line is not in the cache */
+extern LOCAL_SYM res_T
+cache_get_line
+ (struct cache* cache,
+ const size_t line_id,
+ struct line* line);
+
+extern LOCAL_SYM void
+cache_put_chunk
+ (struct cache* cache,
+ const size_t chunk_id,
+ const struct line lines[NLINES_PER_CHUNK]);
+
+#endif /* SHTR_CACHE_H */
diff --git a/src/shtr_line_list.c b/src/shtr_line_list.c
@@ -16,14 +16,177 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>. */
-#include "shtr.h"
#include "shtr_c.h"
+#include "shtr_cache.h"
#include "shtr_line_list_c.h"
#include "shtr_param.h"
#include <rsys/cstr.h>
#include <rsys/text_reader.h>
+/* Maximum size of a compressed block, which in the worst case could correspond
+ * to the initial block size plus an overhead of 6 bytes, in addition to 5 bytes
+ * per 16 KB of uncompressed data (see https://www.zlib.net/zlib_tech.html) */
+#define ZCHUNK_MAX_SIZE (CHUNK_SIZE + 6 + (5*(CHUNK_SIZE+16383/*ceil*/)/16384))
+
+/*******************************************************************************
+ * Compression API
+ ******************************************************************************/
+struct zctx { /* Compression context used while lines are being loaded */
+  struct line* lines; /* Uncompressed Lines */
+  size_t nlines; /* Number of uncompressed lines */
+
+  struct line last_line; /* Last line added. Used to check the order of lines */
+
+  char* zlines; /* Compressed lines */
+
+  z_stream stream; /* zlib */
+  int zlib_is_init; /* Set once deflateInit succeeded on the stream */
+
+  struct shtr* shtr; /* Device on which the context holds a reference */
+};
+static const struct zctx ZCTX_NULL = {0}; /* Context that owns nothing */
+
+/* Allocation callback handed to zlib. `opaque' is the memory allocator that
+ * was registered in the z_stream; the returned memory is zero-initialised */
+static voidpf
+zalloc_func(voidpf opaque, uInt items, uInt size)
+{
+  struct mem_allocator* allocator = (struct mem_allocator*)opaque;
+  ASSERT(opaque);
+  return MEM_CALLOC(allocator, items, size);
+}
+
+/* Deallocation callback handed to zlib. `opaque' is the memory allocator that
+ * was registered in the z_stream and `address' the memory to free */
+static void
+zfree_func(voidpf opaque, voidpf address)
+{
+  struct mem_allocator* allocator = (struct mem_allocator*)opaque;
+  ASSERT(opaque);
+  MEM_RM(allocator, address);
+}
+
+/* Release everything owned by the compression context: the zlib deflate
+ * state, the buffers of uncompressed and compressed lines, and the reference
+ * taken on the shtr device.
+ *
+ * The function is idempotent and accepts a context equal to ZCTX_NULL. This
+ * matters because load_stream releases its context on all its exit paths,
+ * i.e. also when the context was never initialised, or when zctx_init has
+ * already released it after a failure. */
+static void
+zctx_release(struct zctx* zctx)
+{
+  ASSERT(zctx);
+
+  /* zctx_init sets the device before anything that can fail. A context
+   * without a device therefore owns nothing */
+  if(!zctx->shtr) return;
+
+  if(zctx->zlib_is_init) deflateEnd(&zctx->stream);
+  if(zctx->lines) MEM_RM(zctx->shtr->allocator, zctx->lines);
+  if(zctx->zlines) MEM_RM(zctx->shtr->allocator, zctx->zlines);
+  SHTR(ref_put(zctx->shtr));
+
+  /* Forget the released resources so that a second release is a no-op */
+  *zctx = ZCTX_NULL;
+}
+
+/* Set up a compression context: take a reference on `shtr', allocate the
+ * buffer of NLINES_PER_CHUNK uncompressed lines and the buffer of
+ * ZCHUNK_MAX_SIZE bytes that receives their compressed form, then initialise
+ * the zlib deflate stream so that it allocates through the shtr allocator.
+ *
+ * Returns RES_OK on success, RES_MEM_ERR if a buffer cannot be allocated and
+ * RES_UNKNOWN_ERR if zlib cannot be initialised. On error the context is
+ * released before returning. */
+static res_T
+zctx_init(struct zctx* zctx, struct shtr* shtr)
+{
+  res_T res = RES_OK;
+  ASSERT(zctx && shtr);
+
+  /* Start from a clean state that owns nothing but the device */
+  *zctx = ZCTX_NULL;
+  SHTR(ref_get(shtr));
+  zctx->shtr = shtr;
+  zctx->nlines = 0;
+
+  /* Buffer of the lines waiting for compression */
+  zctx->lines = MEM_CALLOC
+    (zctx->shtr->allocator, NLINES_PER_CHUNK, sizeof(*zctx->lines));
+  if(!zctx->lines) { res = RES_MEM_ERR; goto error; }
+
+  /* Buffer that receives the compressed lines */
+  zctx->zlines = MEM_ALLOC(zctx->shtr->allocator, ZCHUNK_MAX_SIZE);
+  if(!zctx->zlines) { res = RES_MEM_ERR; goto error; }
+
+  /* zlib must allocate its internal state with the shtr allocator */
+  zctx->stream.opaque = zctx->shtr->allocator;
+  zctx->stream.zalloc = zalloc_func;
+  zctx->stream.zfree = zfree_func;
+  if(deflateInit(&zctx->stream, Z_DEFAULT_COMPRESSION) != Z_OK) {
+    res = RES_UNKNOWN_ERR;
+    goto error;
+  }
+  zctx->zlib_is_init = 1;
+
+exit:
+  return res;
+error:
+  zctx_release(zctx);
+  goto exit;
+}
+
+/* Compress the lines pending in `zctx' and store the result in `list'.
+ *
+ * The zctx->nlines buffered lines are deflated in one go. The compressed
+ * bytes are appended to the last memory block of the list if there is enough
+ * room left in it, or to a newly allocated block otherwise. A descriptor of
+ * the compressed chunk (offset and size) is then registered, the number of
+ * lines of the list is updated and the pending buffer is emptied.
+ *
+ * Each chunk is produced as an independent zlib stream, i.e. with its own
+ * header: decompress_zchunk resets its inflater before each chunk and thus
+ * expects every chunk to start with a zlib header. That is why the deflater
+ * is reset once the chunk is flushed.
+ *
+ * Nothing is done if no line is pending. Returns RES_OK on success,
+ * RES_MEM_ERR on allocation failure and RES_UNKNOWN_ERR on zlib error. */
+static res_T
+zctx_deflate(struct zctx* zctx, struct shtr_line_list* list)
+{
+  struct zchunk zchunk = ZCHUNK_NULL__;
+  char* block = NULL;
+  size_t sz_total = 0;
+  size_t nblocks = 0;
+  size_t n = 0;
+  int ret = 0;
+  res_T res = RES_OK;
+
+  ASSERT(zctx && list);
+
+  if(!zctx->nlines) goto exit; /* Nothing to do */
+
+  /* Setup input/output for zlib */
+  zctx->stream.next_in = (unsigned char*)zctx->lines;
+  zctx->stream.avail_in = (uInt)(zctx->nlines * sizeof(*zctx->lines));
+  zctx->stream.next_out = (unsigned char*)zctx->zlines;
+  zctx->stream.avail_out = ZCHUNK_MAX_SIZE;
+
+  /* Compress */
+  ret = deflate(&zctx->stream, Z_FULL_FLUSH);
+  if(ret != Z_OK) { res = RES_UNKNOWN_ERR; goto error; }
+
+  /* The flush is complete only if all the input was consumed and the output
+   * buffer was not exhausted. Otherwise the chunk would be stored truncated */
+  if(zctx->stream.avail_in != 0 || zctx->stream.avail_out == 0) {
+    res = RES_UNKNOWN_ERR;
+    goto error;
+  }
+
+  /* Calculate the size after compression */
+  zchunk.size = (uint32_t)(ZCHUNK_MAX_SIZE - zctx->stream.avail_out);
+
+  /* Restart the deflater so that the next chunk begins with a zlib header and
+   * can be decompressed on its own */
+  ret = deflateReset(&zctx->stream);
+  if(ret != Z_OK) { res = RES_UNKNOWN_ERR; goto error; }
+
+  /* Calculate the total size already allocated for compressed lines */
+  nblocks = darray_charp_size_get(&list->blocks);
+  sz_total = nblocks * BLOCK_SIZE;
+
+  /* Check that the last memory block has enough space to store the compressed
+   * chunk */
+  n = darray_zchunk_size_get(&list->zchunks);
+  if(n) { /* Is there a block? */
+    struct zchunk* prev_chunk = &darray_zchunk_data_get(&list->zchunks)[n-1];
+    size_t sz_in_use = prev_chunk->offset + prev_chunk->size;
+    size_t sz_remain = sz_total - sz_in_use;
+
+    if(sz_remain > zchunk.size) {
+      zchunk.offset = sz_in_use;
+      block = darray_charp_data_get(&list->blocks)[nblocks-1];
+    }
+  }
+
+  /* No memory available. Allocate a new block */
+  if(!block) {
+    block = MEM_CALLOC(list->shtr->allocator, 1, BLOCK_SIZE);
+    if(!block) { res = RES_MEM_ERR; goto error; }
+
+    res = darray_charp_push_back(&list->blocks, &block);
+    if(res != RES_OK) {
+      /* The list does not own the block yet: free it here to avoid a leak */
+      MEM_RM(list->shtr->allocator, block);
+      block = NULL;
+      goto error;
+    }
+
+    /* The chunk is stored at the beginning of the new block */
+    zchunk.offset = sz_total;
+  }
+
+  /* Register the chunk */
+  res = darray_zchunk_push_back(&list->zchunks, &zchunk);
+  if(res != RES_OK) goto error;
+
+  /* Save compressed chunk data */
+  memcpy(block + zchunk.offset % BLOCK_SIZE, zctx->zlines, zchunk.size);
+
+  /* Update the number of fully recorded lines,
+   * i.e., compressed and stored in the list */
+  list->nlines += zctx->nlines;
+
+  /* No lines waiting for compression. */
+  zctx->nlines = 0;
+
+exit:
+  return res;
+error:
+  ERROR(list->shtr, "Error while compressing lines -- %s\n",
+    zctx->stream.msg ? zctx->stream.msg : res_to_cstr(res));
+  goto exit;
+}
+
/*******************************************************************************
* Helper functions
******************************************************************************/
@@ -42,6 +205,7 @@ static res_T
create_line_list(struct shtr* shtr, struct shtr_line_list** out_list)
{
struct shtr_line_list* list = NULL;
+ int ret = Z_OK; /* zlib */
res_T res = RES_OK;
ASSERT(shtr && out_list);
@@ -54,9 +218,25 @@ create_line_list(struct shtr* shtr, struct shtr_line_list** out_list)
ref_init(&list->ref);
SHTR(ref_get(shtr));
list->shtr = shtr;
- darray_line_init(shtr->allocator, &list->lines);
+ darray_zchunk_init(shtr->allocator, &list->zchunks);
+ darray_charp_init(shtr->allocator, &list->blocks);
list->info = SHTR_LINE_LIST_INFO_NULL;
+ res = cache_create(shtr, &list->cache);
+ if(res != RES_OK) goto error;
+
+ /* Initialize zlib */
+ list->z_stream.zalloc = zalloc_func;
+ list->z_stream.zfree = zfree_func;
+ list->z_stream.opaque = list->shtr->allocator;
+ ret = inflateInit(&list->z_stream);
+ if(ret != Z_OK) {
+ res = RES_UNKNOWN_ERR;
+ ERROR(shtr, "Error intializing line decompressor -- %s\n", res_to_cstr(res));
+ goto error;
+ }
+ list->zlib_is_init = 1;
+
exit:
*out_list = list;
return res;
@@ -144,19 +324,22 @@ static res_T
register_line
(struct shtr_line_list* list,
const struct txtrdr* txtrdr,
- const struct shtr_line* line)
+ const struct shtr_line* line,
+ struct zctx* zctx)
{
struct shtr_line ln = SHTR_LINE_NULL;
struct line ln_encoded = LINE_NULL;
res_T res = RES_OK;
- ASSERT(list && txtrdr && line); /* Pre-conditions */
+
+ /* Pre-conditions */
+ ASSERT(list && txtrdr && line);
+ ASSERT(zctx && zctx->nlines < NLINES_PER_CHUNK);
line_encode(line, &ln_encoded);
- if(darray_line_size_get(&list->lines)) {
- const struct line* last_ln = darray_line_cdata_get(&list->lines)
- + darray_line_size_get(&list->lines) - 1;
- if(last_ln->wavenumber > line->wavenumber) {
+ /* Check if a line has been saved. If so, ensure that the lines are sorted */
+ if(darray_zchunk_size_get(&list->zchunks) || zctx->nlines) {
+ if(zctx->last_line.wavenumber > ln_encoded.wavenumber) {
ERROR(list->shtr,
"%s:%lu: lines are not sorted in ascending order wrt their wavenumber.\n",
txtrdr_get_name(txtrdr), txtrdr_get_line_num(txtrdr));
@@ -165,12 +348,14 @@ register_line
}
}
- res = darray_line_push_back(&list->lines, &ln_encoded);
- if(res != RES_OK) {
- ERROR(list->shtr,
- "%s:%lu: error storing the line -- %s.\n",
- txtrdr_get_name(txtrdr), txtrdr_get_line_num(txtrdr), res_to_cstr(res));
- goto error;
+ zctx->last_line = ln_encoded;
+ zctx->lines[zctx->nlines] = ln_encoded;
+ zctx->nlines += 1;
+
+ /* The chunk is full. Compress it */
+ if(zctx->nlines == NLINES_PER_CHUNK) {
+ res = zctx_deflate(zctx, list);
+ if(res != RES_OK) goto error;
}
line_decode(&ln_encoded, &ln);
@@ -308,13 +493,17 @@ load_stream
const char* name,
struct shtr_line_list** out_lines)
{
+ struct zctx zctx = ZCTX_NULL;
struct shtr_line_list* list = NULL;
struct txtrdr* txtrdr = NULL;
res_T res = RES_OK;
+
ASSERT(shtr && stream && name && out_lines);
res = create_line_list(shtr, &list);
if(res != RES_OK) goto error;
+ res = zctx_init(&zctx, shtr);
+ if(res != RES_OK) goto error;
res = txtrdr_stream(list->shtr->allocator, stream, name,
0/*No comment char*/, &txtrdr);
@@ -324,6 +513,7 @@ load_stream
goto error;
}
+
for(;;) {
struct shtr_line ln = SHTR_LINE_NULL;
@@ -339,12 +529,17 @@ load_stream
res = parse_line(list, txtrdr, &ln);
if(res != RES_OK) goto error;
- res = register_line(list, txtrdr, &ln);
+ res = register_line(list, txtrdr, &ln, &zctx);
if(res != RES_OK) goto error;
}
+ /* Ensure that remaining lines are compressed and stored */
+ res = zctx_deflate(&zctx, list);
+ if(res != RES_OK) goto error;
+
exit:
if(txtrdr) txtrdr_ref_put(txtrdr);
+ zctx_release(&zctx);
*out_lines = list;
return res;
error:
@@ -355,15 +550,69 @@ error:
goto exit;
}
+/* Decompress the chunk `chunk_id' of the list into `lines'.
+ *
+ * The chunk descriptor gives the location of the compressed data: its offset
+ * defines both the memory block (offset/BLOCK_SIZE) and the position in this
+ * block (offset%BLOCK_SIZE). The data is inflated with the z_stream of the
+ * list, which is reset afterwards, on success as well as on error, so that it
+ * is ready for the next chunk.
+ *
+ * `lines' must have room for NLINES_PER_CHUNK lines. Returns RES_OK on
+ * success and RES_UNKNOWN_ERR if zlib reports an error.
+ *
+ * NOTE(review): the z_stream of the list is used here without any lock;
+ * concurrent calls on the same list look unsafe -- to be confirmed. */
+static res_T
+decompress_zchunk
+  (struct shtr_line_list* list,
+   const size_t chunk_id,
+   struct line lines[NLINES_PER_CHUNK])
+{
+  const struct zchunk* zchunk = NULL;
+  char* block = NULL;
+  size_t block_id = 0;
+  size_t block_offset = 0;
+  int ret = Z_OK; /* zlib */
+  res_T res = RES_OK;
+
+  ASSERT(list && lines && chunk_id < darray_zchunk_size_get(&list->zchunks));
+
+  /* Locate the compressed data */
+  zchunk = darray_zchunk_cdata_get(&list->zchunks) + chunk_id;
+  block_id = zchunk->offset / BLOCK_SIZE;
+  block_offset = zchunk->offset % BLOCK_SIZE;
+  ASSERT(block_id < darray_charp_size_get(&list->blocks));
+
+  block = darray_charp_cdata_get(&list->blocks)[block_id];
+
+  /* Setup input/output for zlib */
+  list->z_stream.next_in = (unsigned char*)(block + block_offset);
+  list->z_stream.avail_in = (uInt)zchunk->size;
+  list->z_stream.next_out = (unsigned char*)lines;
+  list->z_stream.avail_out = (uInt)(sizeof(struct line)*NLINES_PER_CHUNK);
+
+  /* Decompress */
+  ret = inflate(&list->z_stream, Z_SYNC_FLUSH);
+  if(ret != Z_OK) {
+    res = RES_UNKNOWN_ERR;
+    /* zlib does not provide a message for every error code */
+    ERROR(list->shtr, "Error decompressing the chunk of lines -- %s\n",
+      list->z_stream.msg ? list->z_stream.msg : res_to_cstr(res));
+    goto error;
+  }
+
+  CHK(inflateReset(&list->z_stream) == Z_OK);
+
+exit:
+  return res;
+error:
+  /* Do not leave the inflater in an error state: it is reused for the
+   * subsequent decompressions */
+  (void)inflateReset(&list->z_stream);
+  goto exit;
+}
+
static void
release_lines(ref_T * ref)
{
struct shtr* shtr = NULL;
- struct shtr_line_list* list = CONTAINER_OF
- (ref, struct shtr_line_list, ref);
+ struct shtr_line_list* list = CONTAINER_OF(ref, struct shtr_line_list, ref);
+ char** blocks = NULL;
+ size_t i=0, n=0;
+
ASSERT(ref);
+
shtr = list->shtr;
- darray_line_release(&list->lines);
+
+ if(list->cache) cache_ref_put(list->cache);
+ if(list->zlib_is_init) inflateEnd(&list->z_stream);
+
+ n = darray_charp_size_get(&list->blocks);
+ blocks = darray_charp_data_get(&list->blocks);
+ FOR_EACH(i, 0, n) { if(blocks[i]) MEM_RM(shtr->allocator, blocks[i]); }
+
+ darray_zchunk_release(&list->zchunks);
+ darray_charp_release(&list->blocks);
MEM_RM(shtr->allocator, list);
SHTR(ref_put(shtr));
}
@@ -416,7 +665,8 @@ shtr_line_list_create_from_stream
struct shtr_line_list** out_list)
{
struct shtr_line_list* list = NULL;
- size_t nlines;
+ char** blocks = NULL;
+ size_t i=0, n=0;
int version = 0;
res_T res = RES_OK;
@@ -437,11 +687,12 @@ shtr_line_list_create_from_stream
} else { \
res = RES_UNKNOWN_ERR; \
} \
- ERROR(shtr, "%s: error reading isotope metadata -- %s.\n", \
- FUNC_NAME, res_to_cstr(res)); \
+ ERROR(shtr, \
+ "%s: error reading line list -- %s.\n", FUNC_NAME, res_to_cstr(res)); \
goto error; \
} \
} (void)0
+
READ(&version, 1);
if(version != SHTR_LINE_LIST_VERSION) {
ERROR(shtr,
@@ -452,25 +703,33 @@ shtr_line_list_create_from_stream
goto error;
}
- READ(&nlines, 1);
- res = darray_line_resize(&list->lines, nlines);
- if(res != RES_OK) {
- ERROR(shtr, "%s: error allocating the line list -- %s.\n",
- FUNC_NAME, res_to_cstr(res));
- goto error;
+ READ(&list->nlines, 1);
+
+ /* Memory descriptor of compressed chunks */
+ READ(&n, 1);
+ if((res = darray_zchunk_resize(&list->zchunks, n)) != RES_OK) goto error;
+ READ(darray_zchunk_data_get(&list->zchunks), n);
+
+ /* Compressed data stored in memory blocks */
+ READ(&n, 1);
+ if((res = darray_charp_resize(&list->blocks, n)) != RES_OK) goto error;
+ blocks = darray_charp_data_get(&list->blocks);
+ FOR_EACH(i, 0, n) {
+ blocks[i] = MEM_ALLOC(list->shtr->allocator, BLOCK_SIZE);
+ if(!blocks[i]) { res = RES_MEM_ERR; goto error; }
+ READ(blocks[i], BLOCK_SIZE);
}
- READ(darray_line_data_get(&list->lines), nlines);
+ /* Informations on line parameters */
+ READ(&list->info, 1);
+
#undef READ
exit:
if(out_list) *out_list = list;
return res;
error:
- if(list) {
- SHTR(line_list_ref_put(list));
- list = NULL;
- }
+ if(list) { SHTR(line_list_ref_put(list)); list = NULL; }
goto exit;
}
@@ -496,25 +755,40 @@ shtr_line_list_get_size
size_t* nlines)
{
if(!list || !nlines) return RES_BAD_ARG;
- *nlines = darray_line_size_get(&list->lines);
+ *nlines = list->nlines;
return RES_OK;
}
res_T
shtr_line_list_at
- (const struct shtr_line_list* list,
+ (struct shtr_line_list* list,
const size_t i,
struct shtr_line* line)
{
- const struct line* ln_encoded = NULL;
- size_t n = 0;
+ struct line ln_encoded = LINE_NULL;
+ res_T res = RES_OK;
- if(!list || !line) return RES_BAD_ARG;
- n = darray_line_size_get(&list->lines);
- if(i >=n) return RES_BAD_ARG;
- ln_encoded = darray_line_cdata_get(&list->lines) + i;
- line_decode(ln_encoded, line);
- return RES_OK;
+ if(!list || !line || i >= list->nlines) return RES_BAD_ARG;
+
+ res = cache_get_line(list->cache, i, &ln_encoded);
+
+ if(res != RES_OK) { /* Cache miss */
+ const size_t chunk_id = i / NLINES_PER_CHUNK;
+ const size_t line_id = i % NLINES_PER_CHUNK;
+ struct line lines[NLINES_PER_CHUNK];
+
+ if((res = decompress_zchunk(list, chunk_id, lines)) != RES_OK) goto error;
+ cache_put_chunk(list->cache, chunk_id, lines);
+
+ ln_encoded = lines[line_id];
+ }
+
+ line_decode(&ln_encoded, line);
+
+exit:
+ return res;
+error:
+ goto exit;
}
res_T
@@ -522,26 +796,41 @@ shtr_line_list_write
(const struct shtr_line_list* list,
FILE* stream)
{
- size_t nlines = 0;
+ char* const* blocks = NULL;
+ size_t i=0, n=0;
res_T res = RES_OK;
- if(!list || !stream) {
- res = RES_BAD_ARG;
- goto error;
- }
-
- nlines = darray_line_size_get(&list->lines);
+ if(!list || !stream) { res = RES_BAD_ARG; goto error; }
#define WRITE(Var, Nb) { \
if(fwrite((Var), sizeof(*(Var)), (Nb), stream) != (Nb)) { \
- ERROR(list->shtr, "%s: error writing line list.\n", FUNC_NAME); \
res = RES_IO_ERR; \
+ ERROR(list->shtr, \
+ "%s: error writing line list -- %s\n", FUNC_NAME, res_to_cstr(res)); \
goto error; \
} \
} (void)0
+
+ /* Version management */
WRITE(&SHTR_LINE_LIST_VERSION, 1);
- WRITE(&nlines, 1);
- WRITE(darray_line_cdata_get(&list->lines), nlines);
+
+ /* Number of lines in the list */
+ WRITE(&list->nlines, 1);
+
+ /* Memory descriptor of compressed chunks */
+ n = darray_zchunk_size_get(&list->zchunks);
+ WRITE(&n, 1);
+ WRITE(darray_zchunk_cdata_get(&list->zchunks), n);
+
+ /* Compressed data stored in memory blocks */
+ blocks = darray_charp_cdata_get(&list->blocks);
+ n = darray_charp_size_get(&list->blocks);
+ WRITE(&n, 1);
+ FOR_EACH(i, 0, n) { WRITE(blocks[i], BLOCK_SIZE); }
+
+ /* Informations on line parameters */
+ WRITE(&list->info, 1);
+
#undef WRITE
exit:
diff --git a/src/shtr_line_list_c.h b/src/shtr_line_list_c.h
@@ -19,12 +19,78 @@
#ifndef SHTR_LINE_LIST_C_H
#define SHTR_LINE_LIST_C_H
+#include "shtr.h"
+
#include <rsys/dynamic_array.h>
#include <rsys/ref_count.h>
-struct shtr;
+#include <zlib.h>
+
+/*
+ * Brief summary of the design
+ *
+ * Since the number of lines can be very large, one of the challenges is to
+ * reduce the memory footprint. Several line parameters are therefore encoded
+ * with reduced precision (see “struct line”).
+ *
+ * In addition, lines are first stored in a chunk of CHUNK_SIZE bytes which,
+ * once filled, is compressed using zlib. The compressed data is then stored in
+ * a fixed-size memory block. Each chunk can be accessed and decompressed
+ * separately. The memory descriptors for the compressed chunks, i.e., the data
+ * defining the location where the compressed data is stored in the memory
+ * blocks (see “struct zchunk”), are stored in a dynamic array in the order in
+ * which the lines are loaded. Similarly, the memory blocks that contain the
+ * compressed data are also listed in a dynamic array in the order in which the
+ * lines are read. Therefore, the index of a line is sufficient to index the
+ * chunk in which the line is stored, and finally the memory block in which its
+ * parameters are compressed.
+ *
+ * The use of dynamic arrays allows this access by simple indexing, but at the
+ * cost of memory overhead due to the dynamic array allocation policy (up to
+ * twice the required size in the worst case). However, the memory space in
+ * question here is not a major problem, since the zchunk structure and the
+ * pointer to a memory block take up very little space. And while linked lists
+ * could have been used instead, they would not only have complicated data
+ * access, with pointers replacing indexing, but they would also have
+ * complicated data [de]serialization, precisely because of the use of pointers
+ * instead of indexes.
+ *
+ * Note that the use of memory blocks stored in dynamic arrays rather than a
+ * simple dynamic array of contiguous bytes is motivated by the issue of
+ * additional memory overhead associated with the use of dynamic arrays. As said
+ * above, in the worst case, the memory overhead here is equal to twice the
+ * number of blocks multiplied by the size of a pointer, compared to twice the
+ * size required to store all the lines.
+ *
+ * A cache is ultimately used to speed up access to lines, which must now be
+ * decompressed. This cache stores the decompressed blocks in which the most
+ * recently accessed lines are stored. The implementation of this cache is
+ * independent of the line storage. However, it must be thread-safe to allow
+ * simultaneous access.
+ */
+
+/* Size in bytes of a memory block in which compressed data is stored */
+#define BLOCK_SIZE (1024*1024)
+
+/* Size in bytes of an uncompressed chunk */
+#define CHUNK_SIZE (128*1024)
+
+/* Number of lines in a chunk */
+#define NLINES_PER_CHUNK (CHUNK_SIZE/sizeof(struct line))
-#define SET_GAMMA_AIR
+/* Memory descriptor of a compressed chunk. The descriptors are written as is
+ * with fwrite by shtr_line_list_write: the in-memory representation of this
+ * structure is therefore part of the serialized data. */
+struct zchunk {
+  /* Offset to chunk data. The offset is indicated as if the compressed data
+   * were stored sequentially. However, the data is stored in memory blocks of
+   * fixed size. The offset therefore defines both the block index
+   * (offset/BLOCK_SIZE) and the offset within the block (offset%BLOCK_SIZE) */
+  size_t offset;
+
+  /* Size in bytes of the compressed chunk */
+  uint32_t size;
+};
+#define ZCHUNK_NULL__ {0}
+static const struct zchunk ZCHUNK_NULL = ZCHUNK_NULL__;
struct line {
double wavenumber; /* Central wavenumber in vacuum [cm^-1] */
@@ -54,24 +120,40 @@ struct line {
#define LINE_NULL__ {0}
static const struct line LINE_NULL = LINE_NULL__;
-/* Generate the dynamic array of lines */
-#define DARRAY_NAME line
-#define DARRAY_DATA struct line
+STATIC_ASSERT(sizeof(struct line)==32, Unexpected_sizeof_struct_line);
+
+/* Generate the dynamic array of zchunk */
+#define DARRAY_NAME zchunk
+#define DARRAY_DATA struct zchunk
#include <rsys/dynamic_array.h>
-STATIC_ASSERT(sizeof(struct line)==32, Unexpected_sizeof_struct_line);
+/* Generate the dynamic array of char* */
+#define DARRAY_NAME charp
+#define DARRAY_DATA char*
+#include <rsys/dynamic_array.h>
/* Version of the line list. One should increment it and perform a version
* management onto serialized data when the line list structure is updated. */
-static const int SHTR_LINE_LIST_VERSION = 1;
+static const int SHTR_LINE_LIST_VERSION = 2;
+
+/* Forward declaration */
+struct cache;
struct shtr_line_list {
- /* Lines sorted in ascending order wrt their wavenumber */
- struct darray_line lines;
+ /* Compressed lines sorted in ascending order wrt their wavenumber */
+ struct darray_zchunk zchunks; /* Accessor to compressed lines */
+ struct darray_charp blocks; /* Memory where compressed lines are stored */
+ size_t nlines; /* Number of lines */
/* Informations on line parameters */
struct shtr_line_list_info info;
+ struct cache* cache;
+
+ /* zlib */
+ z_stream z_stream;
+ int zlib_is_init;
+
struct shtr* shtr;
ref_T ref;
};
diff --git a/src/test_shtr_lines.c b/src/test_shtr_lines.c
@@ -303,8 +303,8 @@ test_load_failures(struct shtr* shtr)
static void
check_line_list_equality
- (const struct shtr_line_list* list1,
- const struct shtr_line_list* list2)
+ (struct shtr_line_list* list1,
+ struct shtr_line_list* list2)
{
size_t n1, n2;
size_t iline, nlines;