1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
//! Store string-findings and prepare them for output.

extern crate encoding_rs;

use crate::input::ByteCounter;
use crate::mission::Mission;
use crate::options::Radix;
use crate::options::ARGS;
use crate::options::ASCII_ENC_LABEL;
use std::io::Write;
use std::str;

/// `OUTPUT_BUF_LEN` needs to be long enough to hold all findings that are
/// decoded to UTF-8 in `scan::scan()`. To estimate the space needed to receive
/// all decodings in UTF-8, the worst case - Asian encodings like `EUC_JP` - has
/// to be taken into consideration: therefore, in order to avoid an output buffer
/// overflow, `OUTPUT_BUF_LEN` should be at least twice as big as
/// `INPUT_BUF_LEN`. You can also check the minimum length with
/// `Decoder::max_utf8_buffer_length_without_replacement`. Unfortunately this
/// cannot be done programmatically, because `output_buffer` is a statically
/// allocated array.
///
/// The value is selected at compile time: regular builds get the large buffer,
/// test builds get a deliberately small one.
#[cfg(not(test))]
pub const OUTPUT_BUF_LEN: usize = 0x9192;
#[cfg(test)]
pub const OUTPUT_BUF_LEN: usize = 0x40;

/// Extra space in bytes for the `ByteCounter` and the encoding name when
/// `Finding::print()` prints a `Finding`.
pub const OUTPUT_LINE_METADATA_LEN: usize = 40;

#[derive(Debug, Eq, PartialEq)]
/// Used to express the precision of `Finding::position` when the algorithm
/// cannot determine its exact position.
pub enum Precision {
    /// The finding is located somewhere before `Finding::position`. It is
    /// guaranteed that the finding is not farther than 2*`--output-line-len`
    /// bytes (or the previous finding from the same scanner) away.
    Before,
    /// The algorithm could determine the exact position of the `Finding` at
    /// `Finding::position`.
    Exact,
    /// The finding is located some `[1..2* --output_line_len]` bytes after
    /// `Finding::position` or - in any case - always before the next
    /// `Finding::position`.
    After,
}

/// `Finding` represents a valid result string decoded to UTF-8 with its
/// original location and its original encoding in the input stream.
#[derive(Debug)]
pub struct Finding<'a> {
    /// A label identifying the origin of the input data: If the origin of the data
    /// is `stdin`: `None`, otherwise: `Some(1)` for input coming from the first
    /// file, `Some(2)` for input from the second file, `Some(3)` for ...
    pub input_file_id: Option<u8>,
    /// `Mission` associated with this finding. We need a reference to the
    /// corresponding `Mission` object here, in order to get additional information,
    /// e.g. the label of the encoding, when we print this `Finding`.
    pub mission: &'static Mission,
    /// The byte number position of this `Finding` in the input stream.
    pub position: ByteCounter,
    /// In some cases the `position` cannot be determined exactly. Therefore,
    /// `position_precision` indicates how well the finding is localized. In case
    /// the position is not exactly known, we indicate whether the finding is
    /// somewhere before or after `position`.
    pub position_precision: Precision,
    /// Whatever the original encoding was, the result string `s` is always stored as
    /// UTF-8. `s` is a `&str` pointing into `FindingCollection::output_buffer`.
    pub s: &'a str,
    /// This flag indicates that `s` holds only the second part of a cut finding
    /// from the previous `scanner::scan()` run. This can happen when a finding from
    /// the previous run has hit the `input_buffer` boundary.
    pub s_completes_previous_s: bool,
}

impl Eq for Finding<'_> {}

/// Equality between findings, meant for debugging and tests.
///
/// Two findings are equal when their position, position precision, encoding
/// name, mission filter and decoded string all match. `input_file_id` and
/// `s_completes_previous_s` do not take part in the comparison.
impl PartialEq for Finding<'_> {
    fn eq(&self, other: &Self) -> bool {
        // Tuples compare element by element, left to right, so this checks
        // the same fields in the same order as a chain of `==` joined by `&&`.
        let lhs = (
            &self.position,
            &self.position_precision,
            self.mission.encoding.name(),
            &self.mission.filter,
            self.s,
        );
        let rhs = (
            &other.position,
            &other.position_precision,
            other.mission.encoding.name(),
            &other.mission.filter,
            other.s,
        );
        lhs == rhs
    }
}

/// Ordering needed by `itertools::kmerge()`, which compares `Finding`s while
/// merging several `FindingCollections` into one iterator.
///
/// Findings are ranked by the first of these keys that differs: `position`,
/// then `mission.mission_id`, then `mission.filter.ubf`; if all three agree,
/// the result is the comparison of `mission.filter.af`.
///
/// NOTE(review): the keys used here differ from the fields compared by
/// `PartialEq` - confirm that callers do not rely on both agreeing.
impl PartialOrd for Finding<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        let (mine, theirs) = (self.mission, other.mission);

        if self.position != other.position {
            return self.position.partial_cmp(&other.position);
        }
        if mine.mission_id != theirs.mission_id {
            return mine.mission_id.partial_cmp(&theirs.mission_id);
        }
        if mine.filter.ubf != theirs.filter.ubf {
            return mine.filter.ubf.partial_cmp(&theirs.filter.ubf);
        }
        // Last tie-breaker.
        mine.filter.af.partial_cmp(&theirs.filter.af)
    }
}

impl<'a> Finding<'a> {
    /// Writes this finding to `out` as a newline followed by optional
    /// metadata and the decoded string `s`.
    ///
    /// Unless `ARGS.no_metadata` is set, the metadata is emitted in this order:
    ///
    /// 1. an input-file letter and a space, when more than one input was given
    ///    and `input_file_id` is `Some`,
    /// 2. when `ARGS.radix` is set: a precision marker (`<`, space or `>`), the
    ///    position in that radix, then `+` or a space, and a tab,
    /// 3. when more than one encoding was requested: `(`, a mission letter, a
    ///    space, the encoding label, `)` and a tab.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error reported by `out`, boxed.
    pub fn print(&self, out: &mut dyn Write) -> Result<(), Box<std::io::Error>> {
        out.write_all(b"\n")?;

        if !ARGS.no_metadata {
            // Input-file marker: 1 -> 'A', 2 -> 'B', 3 -> 'C', ...
            match self.input_file_id {
                Some(file_id) if ARGS.inputs.len() > 1 => {
                    out.write_all(&[file_id + 64_u8, b' '])?;
                }
                _ => {}
            }

            if let Some(radix) = &ARGS.radix {
                // Shows on which side of `position` the finding really lies.
                let precision_mark: &[u8] = match self.position_precision {
                    Precision::Before => b"<",
                    Precision::Exact => b" ",
                    Precision::After => b">",
                };
                out.write_all(precision_mark)?;

                match radix {
                    Radix::X => write!(out, "{:0x}", self.position)?,
                    Radix::D => write!(out, "{:0}", self.position)?,
                    Radix::O => write!(out, "{:0o}", self.position)?,
                }

                // '+' marks a string that continues the previous finding.
                let continuation_mark: &[u8] = if self.s_completes_previous_s {
                    b"+\t"
                } else {
                    b" \t"
                };
                out.write_all(continuation_mark)?;
            }

            if ARGS.encoding.len() > 1 {
                // Mission marker: 0 -> 'a', 1 -> 'b', 2 -> 'c', ...
                let mission_letter = self.mission.mission_id + 97_u8;
                out.write_all(&[b'(', mission_letter, b' '])?;

                let label: &[u8] = if self.mission.print_encoding_as_ascii {
                    ASCII_ENC_LABEL.as_bytes()
                } else {
                    self.mission.encoding.name().as_bytes()
                };
                out.write_all(label)?;

                // Close the label and separate it from the string with one tab.
                out.write_all(b")\t")?;
            }
        }

        out.write_all(self.s.as_bytes())?;
        Ok(())
    }
}