Rust 逐行(line-by-line)读取非 UTF-8 编码的文件
Rust 标准库的 std::io::BufRead 提供了 lines 方法,但它只支持 UTF-8 编码,遇到非 UTF-8 编码的内容时会直接报错。
本文通过使用第三方 crate encoding 来增加对其他编码的支持。
cargo add encoding
code
use std::io::prelude::*;
// The `Encoding` trait must be in scope for `decode_to` to be callable.
use encoding::{all::GB18030, DecoderTrap, Encoding};

// GB18030-encoded bytes of the two-line text "这是一个字符串\n将会被读取"
// ("This is a string\nthat will be read").
// The binding is `mut` because `BufRead::read_until` on a `&[u8]` advances the slice.
let mut bytes: &[u8] = &[
    213, 226, 202, 199, 210, 187, 184, 246, 215, 214, 183, 251, 180, 174, 10, 189, 171,
    187, 225, 177, 187, 182, 193, 200, 161,
];
// Raw (still encoded) bytes of the current line; the allocation is reused across iterations.
let mut buf: Vec<u8> = Vec::new();
loop {
    // `read_until` appends to `buf`, so the previous line must be discarded first.
    buf.clear();
    match bytes.read_until(b'\n', &mut buf) {
        // Zero bytes read means the input is exhausted.
        Ok(0) => break,
        Ok(_) => {
            let mut line = String::new();
            // Undecodable sequences are replaced rather than aborting the whole line.
            GB18030
                .decode_to(&buf, DecoderTrap::Replace, &mut line)
                .expect("decode fail");
            // Strip the trailing line terminator so `println!` does not emit a blank line.
            println!("{}", line.trim_end_matches(&['\r', '\n'][..])); // the decoded string
        }
        Err(e) => panic!("{}", e),
    }
}
封装
我基于上文的原理对 std::io::BufReader 进行了封装,重新实现了 read_line 和 lines 方法。
crates
usage
cargo add encodingbufreader
use encodingbufreader::{BufReaderEncoding};
use encoding::all::,GB18030;
let bytes: &[u8] = &[
213, 226, 202, 199, 210, 187, 184, 246, 215, 214, 183, 251, 180, 174, 10, 189, 171,
187, 225, 177, 187, 182, 193, 200, 161,
];
for line in BufReaderEncoding::new(bytes, GB18030)
.lines()
.map(|l| l.unwrap()){
println!("{}",line);
}
读取文件
use encodingbufreader::{BufReaderEncoding};
use encoding::all::,GB18030;
use std::fs::File;
let file = File::open("test.txt")?;
for line in BufReaderEncoding::new(file, GB18030)
.lines(){
println!("{}",line?);
}