Skip to content

Commit

Permalink
avoid clown shoes allocation of input files
Browse files Browse the repository at this point in the history
Scanner expects its input to be nul terminated, which I implemented as
  buf = std::fs::read(file)
  buf.push(0)

However, std::fs::read carefully sizes its buffer to be exactly the file's size,
which means the push() must grow the buffer.  This means for e.g. a 40mb input
file, we'd read it into a 40mb buffer, then grow the buffer, copying it into an
80mb buffer, just to add one byte!

Fix this by using a buffer of the appropriate size.

I attempted to measure the perf impact here and it wasn't measureable.
Possibly this buffer growth really is that fast?  It seems it will at least
have memory impact at least.
  • Loading branch information
evmar committed Jan 19, 2024
1 parent 909ac60 commit 38899de
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 11 deletions.
7 changes: 3 additions & 4 deletions benches/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ fn bench_parse_synthetic(c: &mut Criterion) {
});
}

fn bench_parse_file(c: &mut Criterion, mut input: Vec<u8>) {
input.push(0);
fn bench_parse_file(c: &mut Criterion, input: &[u8]) {
c.bench_function("parse benches/build.ninja", |b| {
b.iter(|| {
let mut parser = n2::parse::Parser::new(&input);
Expand All @@ -67,8 +66,8 @@ fn bench_parse_file(c: &mut Criterion, mut input: Vec<u8>) {
}

pub fn bench_parse(c: &mut Criterion) {
match std::fs::read("benches/build.ninja") {
Ok(input) => bench_parse_file(c, input),
match n2::scanner::read_file_with_nul("benches/build.ninja".as_ref()) {
Ok(input) => bench_parse_file(c, &input),
Err(err) => {
eprintln!("failed to read benches/build.ninja: {}", err);
eprintln!("using synthetic build.ninja");
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ mod process_posix;
mod process_win;
mod progress;
pub mod run;
mod scanner;
pub mod scanner;
mod signal;
mod smallmap;
mod task;
Expand Down
4 changes: 2 additions & 2 deletions src/load.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use crate::{
eval::{EvalPart, EvalString},
graph::{FileId, RspFile},
parse::Statement,
scanner,
smallmap::SmallMap,
{db, eval, graph, parse, trace},
};
Expand Down Expand Up @@ -172,11 +173,10 @@ impl Loader {

fn read_file(&mut self, id: FileId) -> anyhow::Result<()> {
let path = self.graph.file(id).path().to_path_buf();
let mut bytes = match trace::scope("fs::read", || std::fs::read(&path)) {
let bytes = match trace::scope("read file", || scanner::read_file_with_nul(&path)) {
Ok(b) => b,
Err(e) => bail!("read {}: {}", path.display(), e),
};
bytes.push(0);
self.parse(path, &bytes)
}

Expand Down
20 changes: 19 additions & 1 deletion src/scanner.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! Scans an input string (source file) character by character.
use std::path::Path;
use std::{io::Read, path::Path};

#[derive(Debug)]
pub struct ParseError {
Expand Down Expand Up @@ -132,3 +132,21 @@ impl<'a> Scanner<'a> {
panic!("invalid offset when formatting error")
}
}

/// Scanner wants its input buffer to end in a trailing nul.
/// This function is like std::fs::read() but appends a nul, efficiently.
pub fn read_file_with_nul(path: &Path) -> std::io::Result<Vec<u8>> {
// Using std::fs::read() to read the file and then pushing a nul on the end
// causes us to allocate a buffer the size of the file, then grow it to push
// the nul, copying the entire file(!). So instead create a buffer of the
// right size up front.
let mut file = std::fs::File::open(path)?;
let size = file.metadata()?.len() as usize;
let mut bytes = Vec::with_capacity(size + 1);
unsafe {
bytes.set_len(size);
}
file.read_exact(&mut bytes[..size])?;
bytes.push(0);
Ok(bytes)
}
5 changes: 2 additions & 3 deletions src/task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::{
depfile,
graph::{Build, BuildId, RspFile},
process,
scanner::Scanner,
scanner::{self, Scanner},
};
use anyhow::{anyhow, bail};
use std::path::{Path, PathBuf};
Expand All @@ -38,15 +38,14 @@ pub struct TaskResult {

/// Reads dependencies from a .d file path.
fn read_depfile(path: &Path) -> anyhow::Result<Vec<String>> {
let mut bytes = match std::fs::read(path) {
let bytes = match scanner::read_file_with_nul(path) {
Ok(b) => b,
// See discussion of missing depfiles in #80.
// TODO(#99): warn or error in this circumstance?
Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
Err(e) => bail!("read {}: {}", path.display(), e),
};

bytes.push(0);
let mut scanner = Scanner::new(&bytes);
let parsed_deps = depfile::parse(&mut scanner)
.map_err(|err| anyhow!(scanner.format_parse_error(path, err)))?;
Expand Down

0 comments on commit 38899de

Please sign in to comment.