1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
//! `stringsext` searches for multi-byte encoded strings in binary data.\

//! `stringsext` is a Unicode enhancement of the GNU strings tool with
//! additional functionalities: stringsext recognizes Cyrillic, CJKV characters
//! and other scripts in all supported multi-byte-encodings, while GNU strings
//! fails in finding any of these scripts in UTF-16 and many other encodings.\

//! The role of the main-module is to launch the processing of the input stream in
//! batches with threads. It also receives, merges, sorts and prints the results.

//!  # Operating principle

//!  1. The iterator `input::Slicer` concatenates the input-files and cuts
//!  the input stream into slices called `main::slice`.
//!
//!  2. In `main::run()` these slices are fed in parallel to threads, where each has
//!  its own `Mission` configuration.
//!
//!  3. Each thread runs a search in `main::slice` == `scanner::input_buffer`. The
//!  search is performed by `scanner::FindingCollection::scan()`, which cuts the `scanner::input_buffer`
//!  into smaller chunks of size 2*`output_line_char_nb_max` bytes hereafter called
//! `input_window`.
//!
//!  4. The `Decoder` runs through the `input_window`, searches for valid strings and
//!  decodes them into UTF-8-chunks.
//!
//!  5. Each UTF-8-chunk is then fed into the filter `helper::SplitStr` to be
//!  analyzed if parts of it satisfy certain filter conditions.
//!
//!  6. Doing so, the `helper::SplitStr` cuts the UTF-8-chunk into even smaller
//!  `SplitStr`-chunks not longer than `output_line_char_nb_max` and sends them back to the
//!  `scanner::FindingCollection::scan()` loop.
//!
//!  7. There the `SplitStr`-chunk is packed into a `finding::Finding` object and
//!  then successively added to a `finding::FindingCollection`.
//!
//!  8. After finishing its run through the `input_window` the search continues with
//!  the next `input_window`. Goto 5.
//!
//!  9. When all `input_window` s are processed, `scanner::FindingCollection::scan()` returns the
//!  `finding::FindingCollection` to `main::run()` and exits.
//!
//!  10. `main::run()` waits for all threads to return their
//!  `finding::FindingCollection` s. Then, all `Finding` s are merged,
//!  sorted and finally printed out by `finding::print()`.
//!
//!  11. While the print is still running, the next `main::slice` ==
//!  `scanner::input_buffer` is sent to all threads for the next search.
//!  Goto 3.
//!
//!  12. `main::run()` exits when all `main::slice` s are processed.

extern crate encoding_rs;

mod finding;
mod finding_collection;
mod help;
mod helper;
mod input;
mod mission;
mod options;
mod scanner;

use crate::finding::OUTPUT_LINE_METADATA_LEN;
use crate::finding_collection::FindingCollection;
use crate::help::help;
use crate::input::Slicer;
use crate::mission::MISSIONS;
use crate::options::ARGS;
use crate::scanner::ScannerStates;
use itertools::kmerge;
use scoped_threadpool::Pool;
use std::fs::File;
use std::io;
use std::io::LineWriter;
use std::io::Write;
use std::path::Path;
use std::pin::Pin;
use std::process;
use std::str;
use std::sync::mpsc;
use std::thread;
use std::thread::JoinHandle;

/// Package version, read at compile time from the `CARGO_PKG_VERSION` environment
/// variable (set by Cargo from `../Cargo.toml`). `None` when the variable was not set
/// while compiling.
const VERSION: Option<&str> = option_env!("CARGO_PKG_VERSION");
/// Copyright notice naming the author and the years covered.
const AUTHOR: &str = "(c) Jens Getreu, 2016-2020";

/// Processes the input stream in batches with threads. Then receives, merges, sorts and
/// prints the result.
///
/// A dedicated merger thread owns the receiving end of a bounded channel and the output
/// (the file named in `ARGS.output`, or `stdout` when none is given). For every slice
/// yielded by `Slicer`, one scan job per entry of `ScannerStates::v` is run on the thread
/// pool. Each job sends its `FindingCollection` to the merger, which gathers
/// `MISSIONS.len()` collections per batch, merges them with `kmerge` and prints every
/// finding.
///
/// # Errors
///
/// Returns the error the merger thread ended with: a failure to create the output file,
/// to write to or flush the output, or an error returned by `finding.print()`. When the
/// merger has terminated early, the scan jobs notice that their `send` fails and the
/// remaining input is skipped, so that this error is reported instead of a panic in a
/// worker.
///
/// # Panics
///
/// Panics if the merger thread itself panicked.
fn run() -> Result<(), anyhow::Error> {
    let merger: JoinHandle<_>;
    // Scope for threads
    {
        let n_threads = MISSIONS.len();
        // Bounded channel: at most one pending result per scanner thread.
        let (tx, rx) = mpsc::sync_channel(n_threads);

        //
        // Receiver thread:

        // Receive `FindingCollection`s from scanner threads.
        merger = thread::spawn(move || {
            // Set up output channel.
            let mut output = match ARGS.output {
                Some(ref fname) => {
                    let f = File::create(Path::new(fname))?;
                    // There is at least one `Mission` in `MISSIONS`.
                    let output_line_len =
                        2 * MISSIONS[0].output_line_char_nb_max + OUTPUT_LINE_METADATA_LEN;
                    let f = LineWriter::with_capacity(output_line_len, f);
                    Box::new(f) as Box<dyn Write>
                }
                None => Box::new(io::stdout()) as Box<dyn Write>,
            };
            // The output starts with a byte order mark (U+FEFF).
            output.write_all("\u{feff}".as_bytes())?;

            'batch_receiver: loop {
                // collect
                let mut results: Vec<Pin<Box<FindingCollection>>> = Vec::with_capacity(n_threads);
                for _ in 0..n_threads {
                    // It would be safe to unpin here, as only read operations on data follow.
                    // Instead, `IntoIterator` is implemented for the pinned
                    // `Pin<Box<FindingCollection>>` type, allowing us to `kmerge` a vector of
                    // type `Vec<Pin<Box<FindingCollection>>>`. In this way no unsafe is needed.
                    match rx.recv() {
                        Ok(fc) => results.push(fc),
                        // All senders are gone: there is no more input to merge.
                        Err(_) => break 'batch_receiver,
                    }
                }
                // merge
                for finding in kmerge(&results) {
                    finding.print(&mut output)?;
                }
            }
            output.write_all(b"\n")?;
            output.flush()?;
            Ok(())
        });

        //
        // Sender threads:

        // Setting up the data slice producer.
        let input = Slicer::new();

        // We set up the processor.
        let mut sss = ScannerStates::new(&MISSIONS);
        let mut pool = Pool::new(MISSIONS.len() as u32);

        // Raised by a scan job whose `send` failed. This only happens when the receiver
        // was dropped, i.e. the merger thread has already terminated.
        let receiver_gone = std::sync::atomic::AtomicBool::new(false);

        for (slice, input_file_id, is_last_input_buffer) in input {
            pool.scoped(|scope| {
                for ss in sss.v.iter_mut() {
                    let tx = tx.clone();
                    let receiver_gone_flag = &receiver_gone;
                    scope.execute(move || {
                        let fc = FindingCollection::from(
                            ss,
                            input_file_id,
                            slice,
                            is_last_input_buffer,
                        );
                        // Send the result to the receiver thread. A failure is not
                        // reported here: the cause is the error the merger ended with,
                        // which `merger.join()` below hands to the caller.
                        if tx.send(fc).is_err() {
                            receiver_gone_flag.store(true, std::sync::atomic::Ordering::SeqCst);
                        }
                    });
                }
            });
            // Nobody is listening any more: scanning further input is pointless.
            if receiver_gone.load(std::sync::atomic::Ordering::SeqCst) {
                break;
            }
        }
    } // `tx` drops here, which breaks the `batch_receiver`-loop.

    // If everything goes well, we get `()` here; otherwise the merger's error.
    merger.join().unwrap()

    // All threads terminated.
}

/// Application entry point.
///
/// Calls `help()` first, then `run()`. When `run()` fails, the error is printed to
/// `stderr` in its `Debug` representation and the process exits with status `1`.
fn main() {
    help();

    match run() {
        Ok(()) => {}
        Err(e) => {
            eprintln!("Error: `{:?}`.", e);
            process::exit(1);
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::finding::Precision;
    use crate::finding_collection::FindingCollection;
    use crate::mission::Missions;
    use crate::options::{Args, Radix};
    use crate::scanner::ScannerState;
    use itertools::Itertools;
    use lazy_static::lazy_static;
    use std::path::PathBuf;
    use std::pin::Pin;

    lazy_static! {
        /// Fixed command line arguments used by the tests of this module: two encodings
        /// (`ascii` and `utf-8`), a counter offset of 5000 and an output line length of 30.
        pub static ref ARGS: Args = Args {
            inputs: vec![PathBuf::from("myfile.txt")],
            debug_option: false,
            encoding: vec![String::from("ascii"), String::from("utf-8")],
            list_encodings: false,
            version: false,
            chars_min: Some(String::from("5")),
            same_unicode_block: true,
            grep_char: None,
            radix: Some(Radix::X),
            output: None,
            output_line_len: Some(String::from("30")),
            no_metadata: false,
            counter_offset: Some(String::from("5000")),
            ascii_filter: None,
            unicode_block_filter: None,
        };

        /// The missions built from the test `ARGS` above.
        pub static ref MISSIONS: Missions = Missions::new(
            ARGS.counter_offset.as_ref(),
            &ARGS.encoding,
            ARGS.chars_min.as_ref(),
            ARGS.same_unicode_block,
            ARGS.ascii_filter.as_ref(),
            ARGS.unicode_block_filter.as_ref(),
            ARGS.grep_char.as_ref(),
            ARGS.output_line_len.as_ref(),
        )
        .unwrap();
    }

    /// Scans the same input once with each of the first two missions, checks the findings
    /// of every scan and then the order in which `kmerge` yields the merged findings.
    #[test]
    fn test_merger() {
        let input = "abcdefgÜhijklmn€opÜqrstuvwÜxyz".as_bytes();

        let missions: &Missions = &MISSIONS;
        let mut scanner0 = ScannerState::new(&missions.v[0]);
        let mut scanner1 = ScannerState::new(&missions.v[1]);

        // One `FindingCollection` per mission, in mission order.
        let collections: Vec<Pin<Box<FindingCollection>>> = vec![
            FindingCollection::from(&mut scanner0, Some(0), input, true),
            FindingCollection::from(&mut scanner1, Some(0), input, true),
        ];

        // Expected strings of each collection before merging.
        let expected_per_mission: [&[&str]; 2] = [
            &["abcdefg", "hijklmn", "qrstuvw"],
            &["abcdefgÜhijklmn", "opÜqrstuvwÜxyz"],
        ];

        assert_eq!(collections.len(), 2);
        for (fc, expected) in collections.iter().zip(expected_per_mission.iter()) {
            assert_eq!(fc.v.len(), expected.len());
            for (finding, s) in fc.v.iter().zip(expected.iter()) {
                assert_eq!(finding.s, *s);
            }
        }

        // Merge the results.
        //
        // After merging and sorting the order is deterministic.
        // See implementation of `PartialOrd` for `Finding` for more
        // details.
        let mut iter = collections.iter().kmerge();

        // Expected merged sequence: (string, position precision, mission id).
        let expected_merged = [
            ("abcdefg", Precision::Exact, 0),
            ("hijklmn", Precision::After, 0),
            ("qrstuvw", Precision::After, 0),
            ("abcdefgÜhijklmn", Precision::Exact, 1),
            ("opÜqrstuvwÜxyz", Precision::After, 1),
        ];

        for (s, precision, mission_id) in expected_merged.iter() {
            let f = iter.next().unwrap();
            assert_eq!(f.s, *s);
            assert_eq!(f.position, 5000);
            assert_eq!(f.position_precision, *precision);
            assert_eq!(f.mission.mission_id, *mission_id);
        }

        // Nothing must be left after the expected findings.
        assert_eq!(iter.next(), None);
    }
}