Commit | Line | Data |
---|---|---|
4bcfdbee BA |
1 | #' @name de_serialize |
2 | #' @rdname de_serialize | |
492cd9e7 | 3 | #' @aliases binarize binarizeTransform getDataInFile |
4bcfdbee | 4 | #' |
492cd9e7 | 5 | #' @title (De)Serialization of a [big]matrix or data stream |
4bcfdbee BA |
6 | #' |
7 | #' @description \code{binarize()} serializes a matrix or CSV file with minimal overhead, | |
8 | #' into a binary file. \code{getDataInFile()} achieves the inverse task: it retrieves | |
492cd9e7 BA |
9 | #' (ASCII) data rows from indices in the binary file. Finally, |
10 | #' \code{binarizeTransform()} serializes transformations of all data chunks; to use it, | |
11 | #' a data-retrieval function must be provided, thus \code{binarize} will most likely be | |
12 | #' used first (and then a function defined to seek in generated binary file) | |
4bcfdbee BA |
13 | #' |
14 | #' @param data_ascii Either a matrix or CSV file, with items in rows | |
15 | #' @param indices Indices of the lines to retrieve | |
16 | #' @param data_bin_file Name of binary file on output (\code{binarize}) | |
17 | #' or input (\code{getDataInFile}) | |
18 | #' @param nb_per_chunk Number of lines to process in one batch | |
19 | #' @inheritParams claws | |
492cd9e7 BA |
20 | #' @param getData Function to retrieve data chunks |
21 | #' @param transform Transformation function to apply on data chunks | |
4bcfdbee BA |
22 | #' |
23 | #' @return For \code{getDataInFile()}, the matrix with rows corresponding to the | |
492cd9e7 BA |
24 | #' requested indices. \code{binarizeTransform} returns the number of processed lines. |
25 | #' \code{binarize} is designed to serialize in several calls, thus returns nothing. | |
4bcfdbee BA |
26 | NULL |
27 | ||
28 | #' @rdname de_serialize | |
29 | #' @export | |
binarize = function(data_ascii, data_bin_file, nb_per_chunk,
  sep=",", nbytes=4, endian=.Platform$endian)
{
  # Serialize 'data_ascii' into the binary file 'data_bin_file'.
  #
  # data_ascii:    a matrix (items in rows), the name of a CSV file, or a connection
  # data_bin_file: name of the binary file; created if missing or empty, else appended to
  # nb_per_chunk:  number of rows (matrix) or lines (connection) processed per batch
  # sep:           field separator used when scanning a file / connection
  # nbytes:        number of bytes used to store each value
  # endian:        endianness used for every write
  #
  # File layout: an 8-byte integer holding the number of values per row, followed by
  # the values of all rows, written row after row on 'nbytes' bytes each.
  # Returns NULL invisibly (the function is called for its side effect).

  if (is.character(data_ascii))
    data_ascii = file(data_ascii, open="r")
  else if (methods::is(data_ascii,"connection") && !isOpen(data_ascii))
    open(data_ascii)
  is_matrix = !methods::is(data_ascii,"connection")
  # Make sure the input connection is released even if reading fails midway.
  if (!is_matrix)
    on.exit(close(data_ascii), add=TRUE)

  first_write = (!file.exists(data_bin_file) || file.info(data_bin_file)$size == 0)
  data_bin = file(data_bin_file, open=if (first_write) "wb" else "ab")
  # Same guarantee for the output file: always closed, on success or on error.
  on.exit(close(data_bin), add=TRUE)

  # On first call only: write the header (row length, always on 8 bytes).
  # The row length is known before any data is written, so no placeholder
  # followed by a seek back to offset 0 is required.
  if (first_write)
  {
    if (is_matrix)
      writeBin(ncol(data_ascii), data_bin, size=8, endian=endian)
    else #connection
    {
      # The first line has to be read to learn the row length; it is written
      # right after the header so that no data is lost.
      data_line = scan(data_ascii, double(), sep=sep, nlines=1, quiet=TRUE)
      writeBin(length(data_line), data_bin, size=8, endian=endian)
      writeBin(data_line, data_bin, size=nbytes, endian=endian)
    }
  }

  if (is_matrix)
  {
    # A chunk size below 1 would never advance past the first row.
    if (nb_per_chunk < 1)
      stop("'nb_per_chunk' must be at least 1 when serializing a matrix")
    nb_rows = nrow(data_ascii)
    index = 1
    while (index <= nb_rows)
    {
      last = min(nb_rows, index+nb_per_chunk-1)
      # t() + as.double() flattens the chunk row by row; drop=FALSE keeps the
      # matrix shape when the chunk has a single row or a single column.
      data_chunk = as.double(t(data_ascii[index:last, , drop=FALSE]))
      writeBin(data_chunk, data_bin, size=nbytes, endian=endian)
      index = index + nb_per_chunk
    }
  }
  else
  {
    repeat
    {
      data_chunk = scan(data_ascii, double(), sep=sep, nlines=nb_per_chunk, quiet=TRUE)
      # scan() yields nothing once the end of the input is reached
      if (length(data_chunk)==0)
        break
      writeBin(data_chunk, data_bin, size=nbytes, endian=endian)
    }
  }

  invisible(NULL)
}
88 | ||
492cd9e7 BA |
89 | #' @rdname de_serialize |
90 | #' @export | |
binarizeTransform = function(getData, transform, data_bin_file, nb_per_chunk,
  nbytes=4, endian=.Platform$endian)
{
  # Fetch the data chunk by chunk through 'getData', apply 'transform' to each
  # chunk and append the result to 'data_bin_file' with binarize().
  #
  # getData:       function taking a vector of indices, returning the matching chunk,
  #                or NULL once there is nothing left to fetch
  # transform:     function applied to every chunk before serialization
  # data_bin_file: name of the binary file receiving the transformed chunks
  # nb_per_chunk:  number of indices requested per call to getData
  # nbytes, endian: forwarded unchanged to binarize()
  #
  # Returns the total number of rows of the (untransformed) chunks processed.

  processed = 0
  shift = 0
  chunk = getData(shift + seq_len(nb_per_chunk))
  while (!is.null(chunk))
  {
    binarize(transform(chunk), data_bin_file, nb_per_chunk,
      sep=",", nbytes=nbytes, endian=endian)
    processed = processed + nrow(chunk)
    # Move on to the next window of nb_per_chunk indices
    shift = shift + nb_per_chunk
    chunk = getData(shift + seq_len(nb_per_chunk))
  }
  processed
}
108 | ||
4bcfdbee BA |
109 | #' @rdname de_serialize |
110 | #' @export | |
56857861 BA |
getDataInFile = function(indices, data_bin_file, nbytes=4, endian=.Platform$endian)
{
  # Retrieve the rows at positions 'indices' from the binary file 'data_bin_file'.
  #
  # indices:       1-based positions of the rows to read
  # data_bin_file: name of the binary file to read from
  # nbytes:        number of bytes used to store each value in the file
  # endian:        endianness used for every read
  #
  # The file is expected to start with an 8-byte integer giving the number of
  # values per row, followed by the rows themselves.
  # Returns a matrix with one row per index found in the file, or NULL if no
  # requested row exists (empty 'indices', empty file, or all indices out of range).

  data_bin = file(data_bin_file, "rb")
  # Always release the connection, including when a read fails.
  on.exit(close(data_bin), add=TRUE)
  data_size = file.info(data_bin_file)$size
  data_length = readBin(data_bin, "integer", n=1, size=8, endian=endian)
  # No header could be read (empty file): there is no row to return.
  if (length(data_length) == 0)
    return (NULL)

  # Alternative: t(sapply(...)) (faster?)
  data_ascii = do.call( rbind, lapply( indices, function(i) {
    # Rows start right after the 8-byte header
    offset = 8+(i-1)*data_length*nbytes
    # Indices below 1 would point into the header; offsets at or beyond the
    # end of the file have nothing to read. Both yield an empty row.
    if (i < 1 || offset >= data_size)
      return (vector("double",0))
    seek(data_bin, offset)
    readBin(data_bin, "double", n=data_length, size=nbytes, endian=endian)
  } ) )

  # rbind() gives NULL for no input and a zero-column matrix when every row is
  # empty; both have length 0 and mean "nothing found".
  if (length(data_ascii) > 0) data_ascii else NULL
}