2020-04-23-AnyQ阿里云centos7编译&测试&添加数据集

python

Posted by berlinfog on April 23, 2020

AnyQ阿里云centos7编译&测试&添加数据集

配置:
本机Windows10家庭版
服务器阿里云centos7 40G
处理器:2.5 GHz主频的Intel ® Xeon ® Platinum 8163(Skylake) 
存储40G
1.下载Anyq包:
git clone https://github.com/baidu/AnyQ.git
2.安装cmake

安装gcc/g++的软件依赖>4.8.2(已安装请跳过)

yum install -y gcc gcc-c++ make automake

下载cmake源代码包(推荐3.3.2)

wget https://cmake.org/files/v3.3/cmake-3.3.2.tar.gz
tar -zxvf cmake-3.3.2.tar.gz
cd cmake-3.3.2/ 
./bootstrap 
gmake 
gmake install
3.安装bison3.0,查看版本号如下
yum install bison
bison -V
4.jdk安装
yum install java-1.8.0-openjdk-devel
java -version
5.修改github下载体积大小
git config --global http.postBuffer 64288000
6.修改一堆AnyQ/cmake/external,部分换成gitee仓库加快一丢丢可怜的速度。

第一个paddle.cmake

# paddle.cmake: pulls PaddlePaddle in as an external project, configures and
# builds it inside PADDLE_INSTALL_DIR, then copies the fluid headers/libraries
# and the MKLML files it ships into the third_party tree.
include(ExternalProject)

set(PADDLE_SOURCES_DIR ${THIRD_PARTY_PATH}/paddle)
set(PADDLE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle)

ExternalProject_Add(
    extern_paddle
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX ${PADDLE_SOURCES_DIR}
    GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle.git"
    # Edited in this guide: the release tag that gets checked out.
    GIT_TAG "v0.14.0"
    # The update step is disabled.
    UPDATE_COMMAND ""
    # PADDLE_INSTALL_DIR is created first and doubles as the build directory;
    # <SOURCE_DIR> is the checkout made by ExternalProject.
    CONFIGURE_COMMAND
        mkdir -p ${PADDLE_INSTALL_DIR}
        && cd ${PADDLE_INSTALL_DIR}
        && ${CMAKE_COMMAND}
            -DCMAKE_INSTALL_PREFIX=${PADDLE_INSTALL_DIR}
            -DCMAKE_BUILD_TYPE=Release
            -DWITH_PYTHON=OFF
            -DWITH_MKL=ON
            -DWITH_MKLDNN=OFF
            -DWITH_GPU=OFF
            -DWITH_FLUID_ONLY=ON
            <SOURCE_DIR>
    # Edited in this guide: -j16 can be lowered to -j1.
    BUILD_COMMAND
        cd ${PADDLE_INSTALL_DIR}
        && make -j16
    INSTALL_COMMAND
        cd ${PADDLE_INSTALL_DIR}
        && make inference_lib_dist
)

# After extern_paddle is built: create the destination directories, then copy
# the fluid headers, the inference libraries, the MKLML headers/libraries, and
# finally place the boost directory under install/boost_1_41_0.
add_custom_command(
    TARGET extern_paddle
    POST_BUILD
    COMMAND mkdir -p third_party/include/paddle/ third_party/lib
    COMMAND cp -rf
            ${PADDLE_INSTALL_DIR}/fluid_install_dir/paddle/fluid
            third_party/include/paddle
    COMMAND cp -rf
            ${PADDLE_INSTALL_DIR}/fluid_install_dir/paddle/fluid/inference/lib*
            third_party/lib
    COMMAND cp -rf
            ${PADDLE_INSTALL_DIR}/fluid_install_dir/third_party/install/mklml/include/*
            ${THIRD_PARTY_PATH}/include/
    COMMAND cp -rf
            ${PADDLE_INSTALL_DIR}/fluid_install_dir/third_party/install/mklml/lib/*
            ${THIRD_PARTY_PATH}/lib/
    COMMAND cp -rf
            ${PADDLE_INSTALL_DIR}/fluid_install_dir/third_party/boost
            ${PADDLE_INSTALL_DIR}/fluid_install_dir/third_party/install/boost_1_41_0
)

第二个xgboost.cmake,主要指定版本

# xgboost.cmake: clones xgboost at a pinned release, builds it in place with
# its own Makefile, then copies headers, libraries and sources into
# third_party/.
include(ExternalProject)

set(XGBOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/xgboost)
# The clone lands in ${XGBOOST_SOURCES_DIR}/src/xgboost and is built there.
set(XGBOOST_INSTALL_DIR ${XGBOOST_SOURCES_DIR}/src/xgboost)

ExternalProject_Add(
    extern_xgboost
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX                ${XGBOOST_SOURCES_DIR}
    DOWNLOAD_DIR          ${XGBOOST_SOURCES_DIR}/src/
    # Edited in this guide: the release is pinned to v0.81.
    # `git clone --recursive` checks the submodules out at the commits recorded
    # by the default branch; after switching to the tag they must be re-synced,
    # otherwise dmlc-core/rabit stay at revisions that do not belong to v0.81.
    DOWNLOAD_COMMAND      git clone --recursive https://github.com/dmlc/xgboost.git
                          && cd xgboost
                          && git checkout v0.81
                          && git submodule update --init --recursive
    DOWNLOAD_NO_PROGRESS  1
    UPDATE_COMMAND        ""
    CONFIGURE_COMMAND     ""
    BUILD_COMMAND         ""
    # Edited in this guide: single-threaded build (-j1). The actual compilation
    # happens in the install step.
    INSTALL_COMMAND       cd ${XGBOOST_INSTALL_DIR} && make -j1
    BUILD_IN_SOURCE       1
)

# After extern_xgboost is built: create third_party/{lib,include} and copy the
# xgboost headers and libraries, the rabit and dmlc-core static libraries, the
# xgboost src directory, and the dmlc-core/rabit headers.
add_custom_command(TARGET extern_xgboost POST_BUILD
    COMMAND mkdir -p third_party/lib/
    COMMAND mkdir -p third_party/include/
    COMMAND cp -r ${XGBOOST_INSTALL_DIR}/include/* third_party/include/
    COMMAND cp -r ${XGBOOST_INSTALL_DIR}/lib/* third_party/lib
    COMMAND cp ${XGBOOST_INSTALL_DIR}/rabit/lib/librabit.a ${XGBOOST_INSTALL_DIR}/dmlc-core/libdmlc.a third_party/lib/
    COMMAND cp -r ${XGBOOST_INSTALL_DIR}/src third_party/
    COMMAND cp -r ${XGBOOST_INSTALL_DIR}/dmlc-core/include/* ${XGBOOST_INSTALL_DIR}/rabit/include/* third_party/include/
)

第三个,brpc.cmake,修改成gitee仓库

# brpc.cmake: clones brpc from a gitee mirror, checks out v0.9.0, and builds it
# in place with config_brpc.sh + make against the headers/libraries already
# present under THIRD_PARTY_PATH.
include(ExternalProject)

set(BRPC_SOURCES_DIR  ${THIRD_PARTY_PATH}/brpc)
set(BRPC_INSTALL_DIR  ${THIRD_PARTY_PATH}/)
set(BRPC_DOWNLOAD_DIR ${BRPC_SOURCES_DIR}/src/)

# Disabled configure-time PATH override; the install step below exports PATH
# itself instead.
#set(ENV{PATH} ${THIRD_PARTY_PATH}/bin:$ENV{PATH} )

ExternalProject_Add(
    extern_brpc
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS
        extern_protobuf
        extern_glog
        extern_gflags
        extern_leveldb
        extern_openssl
    PREFIX ${BRPC_SOURCES_DIR}
    DOWNLOAD_DIR ${BRPC_DOWNLOAD_DIR}
    # Edited in this guide: the repository URL points at a gitee mirror.
    DOWNLOAD_COMMAND
        git clone https://gitee.com/berlinfog/brpc.git
        && cd brpc
        && git checkout v0.9.0
    DOWNLOAD_NO_PROGRESS 1
    UPDATE_COMMAND ""
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
    # Everything happens in the install step: ${THIRD_PARTY_PATH}/bin is put
    # first on PATH, config_brpc.sh is pointed at the third_party include/lib
    # directories, then make runs.
    INSTALL_COMMAND
        cd ${BRPC_DOWNLOAD_DIR}/brpc/
        && export PATH=${THIRD_PARTY_PATH}/bin:$ENV{PATH}
        && bash config_brpc.sh
            --headers=${BRPC_INSTALL_DIR}/include
            --libs=${BRPC_INSTALL_DIR}/lib
            --with-glog
        && make
    BUILD_IN_SOURCE 1
)

# After extern_brpc is built: make sure third_party/{lib,include} exist, then
# copy brpc's output/ tree into BRPC_INSTALL_DIR.
add_custom_command(
    TARGET extern_brpc
    POST_BUILD
    COMMAND mkdir -p third_party/lib/
    COMMAND mkdir -p third_party/include/
    COMMAND cp -r ${BRPC_DOWNLOAD_DIR}/brpc/output/* ${BRPC_INSTALL_DIR}/
)

第四个protobuf.cmake换gitee

# protobuf.cmake: fetches protobuf from a gitee mirror, configures it through
# its ./cmake subdirectory inside the source tree, and installs it into
# THIRD_PARTY_PATH.
include(ExternalProject)

# Extra -D arguments; not referenced by the ExternalProject_Add call below.
set(OPTIONAL_ARGS
    "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
    "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
    "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
    "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
    "-Dprotobuf_WITH_ZLIB=ON"
    "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
    ${EXTERNAL_OPTIONAL_ARGS}
)
set(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")

# Edited in this guide: the repository URL points at a gitee mirror.
set(PROTOBUF_REPO "https://gitee.com/berlinfog/protobuf.git")

# v3.1.0 by default; v3.5.0 when USE_TENSORFLOW is set.
set(PROTOBUF_TAG "v3.1.0")
if(USE_TENSORFLOW)
    set(PROTOBUF_TAG "v3.5.0")
endif()

set(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
set(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH})

ExternalProject_Add(
    extern_protobuf
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS extern_zlib
    PREFIX ${PROTOBUF_SOURCES_DIR}
    GIT_REPOSITORY ${PROTOBUF_REPO}
    GIT_TAG ${PROTOBUF_TAG}
    UPDATE_COMMAND ""
    # In-source configure using the CMake project in <SOURCE_DIR>/cmake.
    CONFIGURE_COMMAND
        cd <SOURCE_DIR>
        && ${CMAKE_COMMAND}
            -DCMAKE_SKIP_RPATH=ON
            -Dprotobuf_BUILD_TESTS=OFF
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
            -DCMAKE_INSTALL_LIBDIR=lib
            ./cmake
    # The build step also runs `make install`, so the install step is empty.
    BUILD_COMMAND
        cd <SOURCE_DIR>
        && make -j8
        && make install
    INSTALL_COMMAND ""
)

# After extern_protobuf is built: copy the protoc binary next to the libraries.
add_custom_command(
    TARGET extern_protobuf
    POST_BUILD
    COMMAND cp ${PROTOBUF_INSTALL_DIR}/bin/protoc ${PROTOBUF_INSTALL_DIR}/lib
)

7.进入安装包开始编译

以上配置好cmake一般不会报错,但是make报错,出了问题就再make一下就行,成功标志为有提示run_server这个服务生成了:

cd /dododo/AnyQ
mkdir build && cd build
cmake ..
然后执行make;可以用 make 2>&1 | tee eva.txt 保存一下出错在哪里的信息

成功标志:

[100%] Linking CXX executable run_server
[100%] Built target run_server
8.系统编译成功以后

在AnyQ/build目录下,执行下面的命令来获取AnyQ定制solr、AnyQ示例配置,注意要使用python2,这里用的是2.7版本

cp ../tools/anyq_deps.sh .
sh anyq_deps.sh
cp ../tools/solr -rp solr_script
sh solr_script/anyq_solr.sh solr_script/sample_docs
9.开始测试原来的数据集
./run_server
10.搞一个新终端
curl 127.0.0.1:8999/anyq?question=以前西凉是现在什么地方
11.需要换个新的数据集测试的往下看
12.windows往阿里云上传文件:
pscp C:\Users\10750\Desktop\WebQA.v1.0\WebQA.v1.0\train root@39.100.233.32:/dododo/AnyQ
13.导入数据集,格式如:

你好吗? 好。

注意这中间的空格都是\t而不是普通的空格并且最后一行不为空

14.将灌库文件faq_file(utf8编码)转换成Json格式:
cp -rp ../tools/solr ./solr_script
python solr_script/make_json.py train faq/schema_format train_json

接下来把数据变成符合格式的json格式:

awk -F "\t" '{print ++ind"\t"$0}' train_json > train_json.index

索引ID添加完毕以后,即可对已添加完索引ID的文件 train_json.index构建语义索引库

15.接下来对于/build/example/conf中的文件进行修改,目录如下

首先修改dict.conf

# dict.conf: declares the named dictionaries. Each dict_config entry carries a
# name, a type and a path; the other conf files refer to an entry by its name
# through their using_dict_name fields.
name: "example_dict_conf"
# Referenced by rank_predictor in rank.conf.
dict_config {
    name: "rank_weights"
    type: "String2FloatAdapter"
    path: "./rank_weights"
}
# Referenced by method_wordseg in analysis.conf.
dict_config {
    name: "lac"
    type: "WordsegAdapter"
    path: "./wordseg_utf8"
}
# Referenced by method_simnet_emb in analysis.conf and by fluid_simnet_feature
# in rank.conf.
dict_config{
        name: "fluid_simnet"
        type: "PaddleSimAdapter"
        path: "./simnet"
}
# Referenced by semantic_recall in retrieval.conf.
dict_config {
        name: "annoy_knowledge_dict"
        type: "String2RetrievalItemAdapter"
        path: "train_json.index"# edited in this guide: point this at the new dataset file
}

接下来修改一下analysis.conf

# analysis.conf: lists the analysis methods. using_dict_name refers to a
# dict_config name declared in dict.conf.
name: "analysis_conf"
analysis_method {
    name: "method_wordseg"
    type: "AnalysisWordseg"
    using_dict_name: "lac"
}
analysis_method {
        name: "method_simnet_emb"
        type: "AnalysisSimNetEmb"
        using_dict_name: "fluid_simnet"
        # NOTE(review): 128 is also the vector_size in retrieval.conf and the
        # value passed to annoy_index_build_tool -- presumably these must
        # agree; confirm before changing.
        dim: 128
        query_feed_name: "left"
        cand_feed_name: "right"
        embedding_fetch_name: "tanh.tmp"
}

修改一下retrieval.conf

# retrieval.conf: a single semantic retrieval plugin. index_path names the
# semantic.annoy file produced by annoy_index_build_tool, and using_dict_name
# refers to the annoy_knowledge_dict entry in dict.conf.
retrieval_plugin {
        name : "semantic_recall"
        type : "SemanticRetrievalPlugin"
        vector_size : 128
        search_k : 10000
        index_path : "./semantic.annoy"
        using_dict_name: "annoy_knowledge_dict"
        num_result : 100# author's note: set to the number of questions in the dataset (NOTE(review): the field name suggests a result-count limit -- confirm)
}

修改一下rank.conf

# rank.conf: two matching_config entries plus one rank_predictor. The
# using_dict_name values refer to dict_config names declared in dict.conf.
name : "test_rank"
top_result: 1
matching_config {
    name : "jaccard_sim"
    type : "JaccardSimilarity"
    output_num : 1
    rough : false
}

# Paddle SimNet matching-model similarity
matching_config {
    name: "fluid_simnet_feature"
    type: "PaddleSimilarity"
    using_dict_name: "fluid_simnet"
    output_num : 1
    rough : false
    query_feed_name: "left"
    cand_feed_name: "right"
    score_fetch_name: "cos_sim_0.tmp"
}
rank_predictor {
    type: "PredictLinearModel"
    using_dict_name: "rank_weights"
}
threshold : 0.5
16.接下来再为`train_json.index`构建**语义索引库**(在`AnyQ/build`下执行):
 ./annoy_index_build_tool example/conf/ example/conf/analysis.conf train_json.index 128 10 semantic.annoy 1>std 2>err
17.随后可以开始启动啦
./run_server
18.询问
curl 127.0.0.1:8999/anyq?question=以前西凉是现在什么地方
后记

快速改配置

cd /dododo/AnyQ/build/example/conf

快速运行

cd /dododo/AnyQ/build
./run_server

服务器返回相应结果