可以通过组合查询语句(与、或、非)对文本进行检索,开发环境为 VS2015。

main函数:读取存有数据的文件并进行检索。提供两种入口:按单词查询,以及按表达式查询。

1 #include <iostream>
 2 #include <fstream>
 3 
 4 #include "TextSearch.h"
 5 #include "TextQueryI.h"
 6 
 7 
 8 using namespace std;
 9 
10 
11 int main() {
12     
13     ifstream is;
14     is.open("c:/tmp/data.txt", ios::in);
15 
16     // Search word
17     // TextSearch::run(is);
18 
19     // Search word with Query
20     TextQueryI q = TextQueryI("good") | TextQueryI("nice") & TextQueryI("day");
21     TextSearch::run(is, q);
22 
23     return 0;
24 }

封装检索功能的入口

TextSearch.h

1 #pragma once
 2 
 3 #include <fstream>
 4 
 5 #include "TextQueryI.h"
 6 
 7 using std::ifstream;
 8 
 9 // 搜索功能入口
10 class TextSearch
11 {
12 public:
13     TextSearch();
14     ~TextSearch();
15 public:
16     static int run(ifstream &infile);
17     static int run(ifstream &infile, TextQueryI &query);
18 };

TextSearch.cpp

1 #include "TextSearch.h"
 2 
 3 #include <iostream>
 4 #include <string>
 5 
 6 #include "TextQuery.h"
 7 #include "TextQueryI.h"
 8 
 9 using std::cin; using std::cout; using std::endl;
10 using std::string;
11 
12 int TextSearch::run(ifstream & infile)
13 {
14     // 读取、分析数据
15     TextQuery tq(infile);
16 
17     // serach query
18     while (true) {
19         cout << "enter search word, or [q] to quit:" << endl;
20         string s;
21         if (!(cin >> s) || s == "q") break;
22         print(cout, tq.query(s)) << endl;
23     }
24     return 0;
25 }
26 
27 int TextSearch::run(ifstream &infile, TextQueryI &query) {
28     cout << "start process query: ";
29     TextQuery tq(infile);
30     print(cout, query.eval(tq)) << endl;
31     return 0;
32 }

TextQuery.h  文本内容分析,保存原始数据的分词分析结果和行号信息。并提供单个词检索的功能。

1 #pragma once
 2 
 3 #include <fstream>
 4 #include <string>
 5 #include <map>
 6 #include <set>
 7 #include <vector>
 8 #include <memory>
 9 
10 #include <iostream>
11 
12 using std::string;
13 using std::vector;
14 using std::endl;
15 
class QueryResult;

/// Splits the input into words and answers single-word lookups.
///
/// No destructor is declared (Rule of Zero): both members release themselves,
/// and an empty user-provided destructor would suppress the implicit move
/// operations.
class TextQuery
{
public:
    /// Zero-based index of a line in the stored input.
    using LineNo = std::vector<std::string>::size_type;

    /// Reads the stream line by line and records, for every
    /// whitespace-separated word, the lines that contain it.
    TextQuery(std::ifstream&);
public:
    /// Looks up a single word and returns its matching lines.
    QueryResult query(const std::string&) const;
private:
    // Input data; each element is one line of text.
    std::shared_ptr<std::vector<std::string> > data;
    // Analysis result: key is a word, value is the set of lines containing it.
    std::map<std::string, std::shared_ptr<std::set<LineNo> > > wm;
};
33 
34 class QueryResult
35 {
36     friend std::ostream& print(std::ostream &, const QueryResult &);
37 public:
38     using LineNo = TextQuery::LineNo;
39     QueryResult(std::string word,
40         std::shared_ptr<std::set<LineNo> > pLineNo,
41         std::shared_ptr<std::vector<std::string> > data) :
42         query_word(word), lines(pLineNo), files(data) {};
43     ~QueryResult() {};
44     auto getFiles() { return files; }
45     auto begin() { return lines->begin(); }
46     auto end() { return lines->end(); }
47 private:
48     // 查询词
49     std::string query_word;
50     // 查询词的行号,ordered
51     std::shared_ptr<std::set<LineNo> > lines;
52     // 对原始数据的引用
53     std::shared_ptr<std::vector<std::string> > files;
54 };

TextQuery.cpp

1 #include "TextQuery.h"
 2 
 3 #include <sstream>
 4 
 5 using std::getline;
 6 using std::istringstream;
 7 
 8 TextQuery::TextQuery(std::ifstream &is): data(new vector<std::string>)
 9 {
10     string text;
11     while (getline(is, text)) {
12         // 读取一行并记录数据,方便给出查询结果
13         data->push_back(text);
14         LineNo line_no = data->size() - 1;
15         // 单词分解
16         istringstream line(text);
17         string word;
18         // 单词查询结果记录
19         while (line >> word) {
20             auto &lines = wm[word]; // 获取智能指针,如果map没有会自动创建
21             if (!lines) {
22                 lines.reset(new std::set<LineNo>);  // 配置智能指针的对象
23             }
24             lines->insert(line_no); // 添加行号,如果重复什么都不做
25         }
26     }
27 }
28 
29 QueryResult TextQuery::query(const string & word) const
30 {
31     static std::shared_ptr<std::set<LineNo> > p_no_result(new std::set<LineNo>);
32     auto loc = wm.find(word);
33     if (loc == wm.end()) {
34         return QueryResult(word, p_no_result, data);
35     }
36     else {
37         return QueryResult(word, loc->second, data);
38     }
39 }
40 
41 // 格式化打印结果,类似于提供tostring
42 std::ostream& print(std::ostream &os, const QueryResult &qr) {
43     os << qr.query_word << " occours " << qr.lines->size() <<
44         (qr.lines->size() > 1 ? "times" : "time") << endl;
45     for (auto num : *qr.lines) {
46         os << "at line:" << num + 1 << " > ";
47         os << *(qr.files->begin() + num) << endl;  // 尽量使用迭代器,不使用下标,获得较好的扩展性
48     }
49     return os;
50 }

TextQueryI.h  使用表达式进行查询的接口,用于屏蔽下层。

1 #pragma once
 2 #include <vector>
 3 #include <string>
 4 #include <memory>
 5 
 6 #include "TextQuery.h"
 7 #include "TextQueryBase.h"
 8 #include "TextQueryWordQuery.h"
 9 
10 // TextQuery接口类
11 class TextQueryI {
12     // 有一个私有的构造函数,需要运算符是友元
13     friend TextQueryI operator~(const TextQueryI &);
14     friend TextQueryI operator&(const TextQueryI &, const TextQueryI &);
15     friend TextQueryI operator|(const TextQueryI &, const TextQueryI &);
16 
17 public:
18     using LineNo = std::vector<std::string>::size_type;
19     TextQueryI(const std::string &s): q(new TextQueryWordQuery(s)) {}
20     // 作为TextQueryBase的唯一接口,自己实现对应的方法来屏蔽TextQueryBase的行为
21     QueryResult eval(const TextQuery &tq) const 
22         { return q->eval(tq); }
23     std::string rep() const
24         { return q->rep(); }
25 private:
26     TextQueryI(std::shared_ptr<TextQueryBase> query): q(query) {}
27     std::shared_ptr<TextQueryBase> q;
28 };
29 
30 std::ostream &
31 operator<<(std::ostream &os, const TextQueryI &tq);
32 
33 TextQueryI operator~(const TextQueryI &);
34 TextQueryI operator&(const TextQueryI &, const TextQueryI &);
35 TextQueryI operator|(const TextQueryI &, const TextQueryI &);

TextQueryI.cpp 实现输出运算符 operator<<,定义放在 .cpp 中以避免头文件被多次包含时出现重复定义。

1 #include "TextQueryI.h"
2 
3 std::ostream &
4 operator<<(std::ostream &os, const TextQueryI &tq) {
5     return os << tq.rep();
6 }

TextQueryBase.h 利用虚函数实现表达式功能实现的抽象类。

1 #pragma once
 2 
 3 #include <string>
 4 
 5 #include "TextQuery.h"
 6 
 7 class TextQueryBase
 8 {
 9     // 用户不会使用TextQueryBase类,所有使用都通过TextQueryI完成
10     friend class TextQueryI;
11 protected:
12     using LineNo = TextQuery::LineNo;
13     virtual ~TextQueryBase() = default;
14 private:
15     // 执行查询
16     virtual QueryResult eval(const TextQuery &) const = 0;
17     // 获得查询对应的string形式表示,类似toString
18     virtual std::string rep() const = 0;
19 };

TextQueryNot.h  实现非逻辑的对象。完成对非逻辑表达式的string表示、完成对分词结果的Not分析。

1 #pragma once
 2 
 3 #include <memory>
 4 
 5 #include "TextQueryI.h"
 6 #include "TextQueryBase.h"
 7 
 8 class TextQueryNot : public TextQueryBase {
 9     friend TextQueryI operator~(const TextQueryI &);
10 private:
11     TextQueryNot(const TextQueryI &q) : query(q) {}
12     virtual QueryResult eval(const TextQuery &) const override;
13     // 获得查询的string表示?
14     virtual std::string rep() const override {
15         return "~(" + query.rep() + ")";
16     }
17 private:
18     TextQueryI query;
19 };
20 
21 inline TextQueryI operator~(const TextQueryI &operand) {
22     return std::shared_ptr<TextQueryBase>(new TextQueryNot(operand));
23 }

TextQueryNot.cpp 实现非逻辑的代码。完成对分词结果进行非逻辑的加工。

1 #include "TextQueryNot.h"
 2 
 3 QueryResult
 4 TextQueryNot::eval(const TextQuery &tq) const {
 5     auto result = query.eval(tq);
 6     auto ret = std::make_shared<std::set<LineNo> >();
 7     auto beg = result.begin(), end = result.end();
 8     auto sz = result.getFiles()->size();
 9     for (size_t n = 0; n != sz; n++) {
10         // 考察结果中的每一行
11         if (beg == end || *beg != n) {
12             ret->insert(n);
13         }
14         else if (beg != end) {
15             ++beg;
16         }
17     }
18     return QueryResult(rep(), ret, result.getFiles());
19 }

TextQueryBinary.h  二元运算的共同基类,同时定义了And和Or运算

1 #pragma once
 2 
 3 #include "TextQueryI.h"
 4 #include "TextQueryBase.h"
 5 
 6 class TextQueryBinary : public TextQueryBase {
 7 protected:
 8     TextQueryBinary(const TextQueryI &left, const TextQueryI &right, std::string s):
 9         lhs(left), rhs(right), opSymbol(s) {}
10     // 只提供打印方法,实际操作还是虚函数
11     std::string rep() const override {
12         return "(" + lhs.rep() + " " + opSymbol + " " + rhs.rep() + ")";
13     }
14 
15 protected:
16     TextQueryI lhs, rhs;   // 操作对象
17     std::string opSymbol;  // 操作符
18 };
19 
20 class TextQueryAnd : public TextQueryBinary {
21     friend TextQueryI operator&(const TextQueryI &, const TextQueryI &);
22 private:
23     TextQueryAnd(const TextQueryI &lhs, const TextQueryI &rhs): TextQueryBinary(lhs, rhs, "&") {}
24     QueryResult eval(const TextQuery &) const override;
25 };
26 
27 class TextQueryOr : public TextQueryBinary {
28     friend TextQueryI operator|(const TextQueryI &, const TextQueryI &);
29 private:
30     TextQueryOr(const TextQueryI &lhs, const TextQueryI &rhs) : TextQueryBinary(lhs, rhs, "|") {}
31     QueryResult eval(const TextQuery &) const override;
32 };

TextQueryBinary.cpp

1 #include "TextQueryBinary.h"
 2 
 3 #include <set>
 4 #include <algorithm>
 5 #include <iterator>
 6 #include <memory>
 7 
 8 QueryResult
 9 TextQueryOr::eval(const TextQuery &tq) const {
10     auto right = rhs.eval(tq), left = lhs.eval(tq);
11     auto ret = std::make_shared<std::set<LineNo> >(left.begin(), left.end());
12     ret->insert(right.begin(), right.end());
13     return QueryResult(rep(), ret, left.getFiles());
14 }
15 
16 QueryResult
17 TextQueryAnd::eval(const TextQuery &tq) const {
18     auto right = rhs.eval(tq), left = lhs.eval(tq);
19     auto ret = std::make_shared<std::set<LineNo> >();
20     std::set_intersection(left.begin(), left.end(), right.begin(), right.end(), std::inserter(*ret, ret->begin()));
21     ret->insert(right.begin(), right.end());
22     return QueryResult(rep(), ret, left.getFiles());
23 }
24 
25 TextQueryI operator&(const TextQueryI &lhs, const TextQueryI &rhs) {
26     return std::shared_ptr<TextQueryBase>(new TextQueryAnd(lhs, rhs));
27 }
28 
29 TextQueryI operator|(const TextQueryI &lhs, const TextQueryI &rhs) {
30     return std::shared_ptr<TextQueryBase>(new TextQueryOr(lhs, rhs));
31 }

TextQueryWordQuery.h  表达式查询的叶子节点,表示对某个词进行查询,相当于表达式体系中对单个词查询的基础功能调用。

1 #pragma once
 2 
 3 #include <string>
 4 
 5 #include "TextQuery.h"
 6 #include "TextQueryBase.h"
 7 
 8 // 对象树的叶子节点
 9 class TextQueryWordQuery : public TextQueryBase
10 {
11     friend class TextQueryI;
12 private:
13     TextQueryWordQuery(const std::string &s) : query_word(s) {}
14     virtual QueryResult eval(const TextQuery &tq) const override {
15         return tq.query(query_word);
16     }
17     virtual std::string rep() const override {
18         return query_word;
19     }
20 private:
21     std::string query_word;
22 };