在日常编程中,我们经常需要处理以分隔符(如逗号、空格或制表符)分隔的文本数据,例如 CSV 文件、几何数据文件等,其中每一行包含若干个整数或浮点数字段。为每一种数据结构单独编写解析器会非常繁琐,我们可以利用模板编程中的递归模板和元组(std::tuple)来实现通用的数据读写:

  • 读取 / 写入任意类型字段组合的数据记录;
  • 支持自定义分隔符;
  • 支持忽略以 # 开头的注释、空行以及多余的空白字符。

实现代码:data_record_io.hpp

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#pragma once

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <vector>

/// @brief Reads and writes delimiter-separated text records, one per line,
///        whose fields have the types `Ts...` (in that order).
///
/// Every field type must support `operator>>` on an input stream and
/// `operator<<` on an output stream. Fields are extracted with formatted
/// input, so a `std::string` field ends at the next whitespace rather than
/// at the delimiter.
///
/// Input conventions:
///   - `#` starts a comment that runs to the end of the line, so `#` cannot be
///     used as a delimiter;
///   - tabs and carriage returns are treated as spaces, and leading/trailing
///     spaces are dropped;
///   - lines that are empty after this clean-up are skipped;
///   - anything left on a line after the last field is ignored.
template <typename... Ts>
class DataRecordIO {
public:
    /// One parsed line: a tuple holding a value for every field type.
    using Record = std::tuple<Ts...>;

    /// @brief Parses every non-empty, non-comment line of @p is into a Record.
    /// @param is        Stream to read lines from, consumed until `getline` fails.
    /// @param delimiter Character expected between consecutive fields. Any
    ///                  whitespace character (space, tab, ...) means "fields are
    ///                  separated by arbitrary whitespace".
    /// @return The records in the order they appear in the stream.
    /// @throws std::runtime_error if a field cannot be parsed or a
    ///         non-whitespace delimiter is missing between two fields.
    [[nodiscard]] static std::vector<Record> read(std::istream &is,
                                                  char delimiter) {
        std::vector<Record> results;
        std::string line;
        while (std::getline(is, line)) {
            clean_line(line);
            if (!line.empty()) {
                results.emplace_back(parse_record(line, delimiter));
            }
        }
        return results;
    }

    /// @brief Writes each record of @p data as one line to @p os.
    /// @param os        Destination stream; values are formatted with whatever
    ///                  settings the stream currently has.
    /// @param data      Records to write.
    /// @param delimiter Character written between consecutive fields.
    static void write(std::ostream &os, const std::vector<Record> &data,
                      char delimiter) {
        for (const auto &record : data) { write_tuple(os, record, delimiter); }
    }

private:
    /// True when @p c is whitespace. The cast avoids undefined behaviour in
    /// std::isspace for negative `char` values.
    static bool is_whitespace(char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    }

    /// Strips the `#` comment, normalises CR/TAB to spaces and trims both ends.
    /// Leaves @p line empty when nothing but blanks/comment was present.
    static void clean_line(std::string &line) {
        // Drop the comment first so that a single trim pass is sufficient.
        if (const auto hash = line.find('#'); hash != std::string::npos) {
            line.erase(hash);
        }

        // std::getline already removed the LF; CR (Windows files) and TAB
        // are turned into plain spaces.
        std::replace(line.begin(), line.end(), '\r', ' ');
        std::replace(line.begin(), line.end(), '\t', ' ');

        const auto first = line.find_first_not_of(' ');
        if (first == std::string::npos) {
            line.clear();  // blank or comment-only line
            return;
        }
        line.erase(line.find_last_not_of(' ') + 1);  // trim tail
        line.erase(0, first);                        // trim head
    }

    /// Parses one cleaned line into a Record.
    static Record parse_record(const std::string &s, char delimiter) {
        std::istringstream ss(s);
        Record r;
        read_tuple(ss, r, delimiter);
        return r;
    }

    /// Recursively extracts field I, then the delimiter, then field I+1, ...
    template <std::size_t I = 0>
    static void read_tuple(std::istream &ss, Record &t, char delimiter) {
        if constexpr (I < sizeof...(Ts)) {
            if (!(ss >> std::get<I>(t))) {
                throw std::runtime_error("Failed to parse field at index "
                                         + std::to_string(I));
            }

            if constexpr (I + 1 < sizeof...(Ts)) {
                // Formatted extraction skips whitespace on its own, so a
                // whitespace delimiter (space, tab) needs no explicit check.
                // Reading it with `ss >> sep` would instead swallow the first
                // character of the next field.
                if (!is_whitespace(delimiter)) {
                    char sep{0};
                    if (!(ss >> sep) || sep != delimiter) {
                        throw std::runtime_error(
                            "Missing delimiter '" + std::string(1, delimiter)
                            + "' after field index " + std::to_string(I));
                    }
                }
            }

            read_tuple<I + 1>(ss, t, delimiter);
        }
    }

    /// Recursively writes field I followed by the delimiter, or by a newline
    /// after the last field.
    template <std::size_t I = 0>
    static void write_tuple(std::ostream &os, const Record &t,
                            char delimiter) {
        if constexpr (I < sizeof...(Ts)) {
            os << std::get<I>(t);

            if constexpr (I + 1 < sizeof...(Ts)) { os << delimiter; }
            else { os << '\n'; }

            write_tuple<I + 1>(os, t, delimiter);
        }
    }
};

使用示例:test.cpp

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include "data_record_io.hpp"

#include <fstream>

int main() {
{
// 读取 Nodes.txt
auto fin = std::fstream("Nodes.csv", std::ios::in);
auto nodes = DataRecordIO<int, double, double>::read(fin, ',');

// 写入 tmp_Nodes.csv
auto fout = std::fstream("tmp_Nodes.csv", std::ios::out);
DataRecordIO<int, double, double>::write(fout, nodes, ' ');
}

{
// 读取 Triangles.txt(有格式错误)
auto fin = std::fstream("Triangles.txt", std::ios::in);
auto triangles =
DataRecordIO<int, int, int, int, double>::read(fin, ' ');

// 写入 tmp_Triangles.txt(剔除了格式错误)
auto fout = std::fstream("tmp_Triangles.txt", std::ios::out);
DataRecordIO<int, int, int, int, double>::write(fout, triangles, ',');
}
return 0;
}

下面附上两个测试文件。

Nodes.csv:每一行包括一个整数和两个浮点数,使用逗号(,)分隔;其中前两行的行尾带有多余的逗号,解析时会被忽略。

Nodes.csv
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
1, 0, 0,
2, 5, 0,
3, 5, 4
4, 0, 4
5, 0, 2
6, 1, 1
7, 2, 1
8, 2, 2
9, 1, 2
10, 1, 1.5
11, 3, 1
12, 4, 1
13, 4, 2
14, 3, 2
15, 1, 0
16, 2, 0
17, 3, 0
18, 4, 0
19, 5, 1
20, 5, 2
21, 5, 3
22, 4, 4
23, 3, 4
24, 2, 4
25, 1, 4
26, 0, 3
27, 0, 1
28, 1.5, 1
29, 2, 1.5
30, 1.5, 2
31, 3.5, 1
32, 4, 1.5
33, 3.5, 2
34, 3, 1.5
35, 1.20086, 2.56496
36, 3.27009, 2.4683
37, 2.7188, 2.46564
38, 2.45879, 1.74272
39, 3.00692, 2.97656
40, 3.66045, 2.91631
41, 2.27308, 3.00789
42, 1.49358, 3.20562
43, 2.50926, 1.24661
44, 2.50132, 0.625798
45, 0.584231, 2.42381
46, 1.78296, 2.53335
47, 3.74502, 2.36867
48, 4.30866, 2.45545
49, 0.737543, 3.22788
50, 0.569472, 0.563776
51, 4.36384, 0.636165
52, 4.24932, 3.26512
53, 4.43301, 1.25
54, 4.34833, 1.84109
55, 1.25, 0.5
56, 1.82162, 0.608539
57, 0.525203, 1.69563
58, 0.618935, 1.15188
59, 3.29189, 0.532131
60, 3.83114, 0.633659
61, 2.24321, 2.42062
62, 3.41178, 3.45409
63, 2.69738, 3.48599
64, 2.48416, 2.1258

Triangles.txt:每一行包括四个整数和一个浮点数,使用空格分隔;文件中含有注释、空行以及行尾多余字段等不规范的格式细节。

Triangles.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# rr

1 41 42 46 3.7
2 29 7 43 3.7 3.e
3 34 14 38 3.7 s
4 42 35 46 3.7 ;;
5 11 44 59 3.7 #e
6 35 42 49 3.7 89
# here
7 34 38 43 3.7
8 38 29 43 3.7
9 41 24 42 3.7
10 41 46 61 3.7
11 45 35 49 3.7
12 59 18 60 3.7
13 57 27 58 3.7
14 53 20 54 3.7
15 38 14 64 3.7
16 40 52 62 3.7
17 8 38 64 3.7
18 16 17 44 3.7
19 24 41 63 3.7
20 43 7 44 3.7
21 3 22 52 3.7
22 12 51 53 3.7
23 51 12 60 3.7
24 50 6 58 3.7
25 6 50 55 3.7
26 44 17 59 3.7
27 26 5 45 3.7
28 44 7 56 3.7
29 20 21 48 3.7
30 11 43 44 3.7
31 61 8 64 3.7
32 40 47 48 3.7
33 25 4 49 3.7
34 8 29 38 3.7
35 11 34 43 3.7
36 52 22 62 3.7
37 4 26 49 3.7
38 21 3 52 3.7
39 55 16 56 3.7
40 24 25 42 3.7
41 17 18 59 3.7
42 5 27 57 3.7
43 19 20 53 3.7
44 48 13 54 3.7
45 9 45 57 3.7
46 16 44 56 3.7
47 15 16 55 3.7
48 27 1 50 3.7
49 18 2 51 3.7
50 2 19 51 3.7
51 1 15 50 3.7
52 26 45 49 3.7
53 9 30 35 3.7
54 30 8 46 3.7
55 14 33 36 3.7
56 40 48 52 3.7
57 28 6 55 3.7
58 45 5 57 3.7
59 33 13 47 3.7
60 20 48 54 3.7
61 14 36 37 3.7
62 23 24 63 3.7
63 37 39 41 3.7
64 42 25 49 3.7
65 39 36 40 3.7
66 22 23 62 3.7
67 37 41 61 3.7
68 48 21 52 3.7
69 37 36 39 3.7
70 9 35 45 3.7
71 6 10 58 3.7
72 10 9 57 3.7
73 7 28 56 3.7
74 32 12 53 3.7
75 13 32 54 3.7
76 35 30 46 3.7
77 31 11 59 3.7
78 12 31 60 3.7
79 47 13 48 3.7
80 40 36 47 3.7
81 36 33 47 3.7
82 46 8 61 3.7
83 51 19 53 3.7
84 27 50 58 3.7
85 18 51 60 3.7
86 14 37 64 3.7
87 50 15 55 3.7
88 37 61 64 3.7
89 28 55 56 3.7
90 39 40 62 3.7
91 41 39 63 3.7
92 10 57 58 3.7
93 32 53 54 3.7
94 31 59 60 3.7
95 39 62 63 3.7
96 62 23 63 3.7