Compiling, Installing, and Configuring DataX, Alibaba's Open-Source Offline Synchronization Tool for Heterogeneous Data Sources

Environment:
[root@node101 DataX]# uname -a
Linux node101 4.4.219-1.el7.elrepo.x86_64 #1 SMP Sun Apr 12 16:13:06 EDT 2020 x86_64 x86_64 x86_64 GNU/Linux
[root@node101 DataX]# cat /etc/redhat-release
CentOS Linux release 7.6.1810 (Core)
[root@node101 DataX]# python
Python 2.7.5 (default, Oct 30 2018, 23:45:53)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-36)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
[root@node101 DataX]# rpm -qa | grep java
java-1.8.0-openjdk-devel-1.8.0.322.b06-1.el7_9.x86_64
python-javapackages-3.4.1-11.el7.noarch
tzdata-java-2021e-1.el7.noarch
javapackages-tools-3.4.1-11.el7.noarch
java-1.8.0-openjdk-headless-1.8.0.322.b06-1.el7_9.x86_64
java-1.8.0-openjdk-1.8.0.322.b06-1.el7_9.x86_64

Install the Java and Maven packages:
yum install java -y
wget http://repos.fedorapeople.org/repos/dchen/apache-maven/epel-apache-maven.repo -O /etc/yum.repos.d/epel-apache-maven.repo
yum -y install apache-maven
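
Optionally verify both toolchains before building (exact versions will vary with what yum resolved):
[root@node101 ~]# java -version
[root@node101 ~]# mvn -v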


Download the source code:
DataX_source_code_home=/root/DataX
cd /root/
git clone https://github.com/alibaba/DataX.git
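
It can help later to record exactly which commit was built; a quick check, assuming a fresh clone of the default branch:
[root@node101 ~]# cd DataX
[root@node101 DataX]# git log -1 --oneline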


Build with Maven:
First switch the Maven mirror to the Aliyun repository:
[root@node101 maven]# pwd
/etc/maven
[root@node101 maven]# cat settings.xml
Add the following inside the <mirrors> </mirrors> element of /etc/maven/settings.xml:
<mirror>
  <id>aliyunmaven</id>
  <mirrorOf>*</mirrorOf>
  <name>Aliyun Public Repository</name>
  <url>https://maven.aliyun.com/repository/public</url>
</mirror>
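
To confirm Maven picked up the mirror, dump the effective settings (help:effective-settings is a goal of the standard maven-help-plugin) and look for the aliyunmaven entry:
[root@node101 maven]# mvn help:effective-settings | grep -A 3 aliyunmaven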


$ cd  {DataX_source_code_home}


Comment out the modules that fail to compile (the HDFS and TSDB readers and writers; the numbers below are line positions in pom.xml):
cat pom.xml
 60         <!-- <module>hdfsreader</module> -->
 68         <!-- <module>tsdbreader</module> -->
 81         <!-- <module>hdfswriter</module> -->
 98         <!-- <module>tsdbwriter</module> -->
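
If you prefer not to edit pom.xml by hand, the same four modules can be commented out with sed; this is only a sketch and assumes each <module> entry appears exactly once (GNU sed's & back-reference re-inserts the matched text):
[root@node101 DataX]# sed -i \
    -e 's|<module>hdfsreader</module>|<!-- & -->|' \
    -e 's|<module>hdfswriter</module>|<!-- & -->|' \
    -e 's|<module>tsdbreader</module>|<!-- & -->|' \
    -e 's|<module>tsdbwriter</module>|<!-- & -->|' \
    pom.xml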


$ mvn -U clean package assembly:assembly -Dmaven.test.skip=true -X


When the build succeeds, the log ends as follows:
[INFO] datax-all .......................................... SUCCESS [04:43 min]
[INFO] datax-common ....................................... SUCCESS [  7.217 s]
[INFO] datax-transformer .................................. SUCCESS [  5.076 s]
[INFO] datax-core ......................................... SUCCESS [  9.603 s]
[INFO] plugin-rdbms-util .................................. SUCCESS [  3.718 s]
[INFO] mysqlreader ........................................ SUCCESS [  2.609 s]
[INFO] drdsreader ......................................... SUCCESS [  3.476 s]
[INFO] sqlserverreader .................................... SUCCESS [  3.511 s]
[INFO] postgresqlreader ................................... SUCCESS [  2.976 s]
[INFO] kingbaseesreader ................................... SUCCESS [  2.635 s]
[INFO] oraclereader ....................................... SUCCESS [  3.067 s]
[INFO] odpsreader ......................................... SUCCESS [  4.741 s]
[INFO] otsreader .......................................... SUCCESS [  4.627 s]
[INFO] otsstreamreader .................................... SUCCESS [  4.596 s]
[INFO] plugin-unstructured-storage-util ................... SUCCESS [  3.321 s]
[INFO] txtfilereader ...................................... SUCCESS [  9.043 s]
[INFO] streamreader ....................................... SUCCESS [  2.204 s]
[INFO] ossreader .......................................... SUCCESS [  9.114 s]
[INFO] ftpreader .......................................... SUCCESS [  8.767 s]
[INFO] mongodbreader ...................................... SUCCESS [  7.671 s]
[INFO] rdbmsreader ........................................ SUCCESS [  2.733 s]
[INFO] hbase11xreader ..................................... SUCCESS [ 13.827 s]
[INFO] hbase094xreader .................................... SUCCESS [  8.680 s]
[INFO] opentsdbreader ..................................... SUCCESS [  6.222 s]
[INFO] cassandrareader .................................... SUCCESS [  4.187 s]
[INFO] gdbreader .......................................... SUCCESS [  9.736 s]
[INFO] oceanbasev10reader ................................. SUCCESS [  3.777 s]
[INFO] mysqlwriter ........................................ SUCCESS [  2.152 s]
[INFO] tdenginewriter ..................................... SUCCESS [  4.274 s]
[INFO] drdswriter ......................................... SUCCESS [  2.347 s]
[INFO] odpswriter ......................................... SUCCESS [  4.852 s]
[INFO] txtfilewriter ...................................... SUCCESS [  7.167 s]
[INFO] ftpwriter .......................................... SUCCESS [  8.369 s]
[INFO] streamwriter ....................................... SUCCESS [  1.996 s]
[INFO] otswriter .......................................... SUCCESS [  4.322 s]
[INFO] oraclewriter ....................................... SUCCESS [  2.136 s]
[INFO] sqlserverwriter .................................... SUCCESS [  2.002 s]
[INFO] postgresqlwriter ................................... SUCCESS [  2.027 s]
[INFO] kingbaseeswriter ................................... SUCCESS [  2.017 s]
[INFO] osswriter .......................................... SUCCESS [  6.950 s]
[INFO] mongodbwriter ...................................... SUCCESS [  6.970 s]
[INFO] adswriter .......................................... SUCCESS [  6.641 s]
[INFO] ocswriter .......................................... SUCCESS [  4.350 s]
[INFO] rdbmswriter ........................................ SUCCESS [  2.312 s]
[INFO] hbase11xwriter ..................................... SUCCESS [ 13.488 s]
[INFO] hbase094xwriter .................................... SUCCESS [  6.986 s]
[INFO] hbase11xsqlwriter .................................. SUCCESS [ 19.160 s]
[INFO] hbase11xsqlreader .................................. SUCCESS [ 19.933 s]
[INFO] elasticsearchwriter ................................ SUCCESS [  3.980 s]
[INFO] adbpgwriter ........................................ SUCCESS [  4.821 s]
[INFO] gdbwriter .......................................... SUCCESS [  8.611 s]
[INFO] cassandrawriter .................................... SUCCESS [  4.164 s]
[INFO] clickhousewriter ................................... SUCCESS [  4.105 s]
[INFO] oscarwriter ........................................ SUCCESS [  2.346 s]
[INFO] oceanbasev10writer ................................. SUCCESS [  4.574 s]
[INFO] hbase20xsqlreader .................................. SUCCESS [  3.002 s]
[INFO] hbase20xsqlwriter .................................. SUCCESS [  2.440 s]
[INFO] kuduwriter ......................................... SUCCESS [  2.997 s]
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 10:03 min
[INFO] Finished at: 2022-03-12T10:44:05+08:00
[INFO] Final Memory: 487M/1083M
[INFO] ------------------------------------------------------------------------




After a successful build, the DataX package is located at {DataX_source_code_home}/target/datax/datax/ , with the following structure:
$ cd {DataX_source_code_home}/target/datax/datax
[root@node101 datax]# pwd
/root/DataX/target/datax/datax
[root@node101 datax]# ls
bin  conf  job  lib  plugin  script  tmp


Copy the datax directory to /usr/local/:
[root@node101 datax]# pwd
/root/DataX/target/datax
[root@node101 datax]# ls
datax
[root@node101 datax]# cp -r datax /usr/local/


Create a command-line symlink:
[root@node101 datax]# chmod 755 /usr/local/datax -R
[root@node101 datax]# ln -sf /usr/local/datax/bin/datax.py /usr/bin/datax.py
[root@node101 datax]# ll /usr/bin/datax.py
lrwxrwxrwx 1 root root 29 3月  12 11:12 /usr/bin/datax.py -> /usr/local/datax/bin/datax.py
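
As a quick smoke test, run the sample job that the build places under job/ (assuming job/job.json is present, as in a stock build); invoking the script through its real path sidesteps any symlink-resolution quirks:
[root@node101 datax]# python /usr/local/datax/bin/datax.py /usr/local/datax/job/job.json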


Test:
Read records from a generated stream and print them to the console (streamreader to streamwriter).
View the officially provided template:
[root@node01 datax]# python  bin/datax.py  -r streamreader -w streamwriter
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
Please refer to the streamreader document:
     https://github.com/alibaba/DataX/blob/master/streamreader/doc/streamreader.md
Please refer to the streamwriter document:
     https://github.com/alibaba/DataX/blob/master/streamwriter/doc/streamwriter.md
Please save the following configuration as a json file and  use
   python {DATAX_HOME}/bin/datax.py {JSON_FILE_NAME}.json
to run the job.
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "streamreader",
                    "parameter": {
                        "column": [],
                        "sliceRecordCount": ""
                    }
                },
                "writer": {
                    "name": "streamwriter",
                    "parameter": {
                        "encoding": "",
                        "print": true
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": ""
            }
        }
    }
}


Modify the configuration based on the template:
[root@node01 datax]# vim job/stream2_stream.json
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "streamreader",
                    "parameter": {
                        "sliceRecordCount": 10,
                        "column": [
                            {
                                "type": "long",
                                "value": "10"
                            },
                            {
                                "type": "string",
                                "value": "hello,DataX"
                            }
                        ]
                    }
                },
                "writer": {
                    "name": "streamwriter",
                    "parameter": {
                        "encoding": "UTF-8",
                        "print": true
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}
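
DataX aborts on malformed JSON, so it can save a round trip to validate the file first; json.tool ships with the system Python 2.7:
[root@node101 datax]# python -m json.tool job/stream2_stream.json > /dev/null && echo "JSON OK"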
Run the job (partial log shown below):
[root@node101 datax]# pwd
/usr/local/datax
[root@node101 datax]# ls
bin  conf  job  lib  log  log_perf  plugin  script  tmp
[root@node101 datax]# ./bin/datax.py ./job/stream2_stream.json


DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved
2022-03-12 11:15:31.677 [main] INFO  VMInfo - VMInfo# operatingSystem class => sun.management.OperatingSystemImpl
2022-03-12 11:15:31.693 [main] INFO  Engine - the machine info  =>
        osInfo:        Red Hat, Inc. 1.8 25.322-b06
        jvmInfo:        Linux amd64 4.4.219-1.el7.elrepo.x86_64
        cpu num:        4
        totalPhysicalMemory:        -0.00G
        freePhysicalMemory:        -0.00G
        maxFileDescriptorCount:        -1
        currentOpenFileDescriptorCount:        -1
        GC Names        [PS MarkSweep, PS Scavenge]
        MEMORY_NAME                    | allocation_size                | init_size                     
        PS Eden Space                  | 256.00MB                       | 256.00MB                      
        Code Cache                     | 240.00MB                       | 2.44MB                        
        Compressed Class Space         | 1,024.00MB                     | 0.00MB                        
        PS Survivor Space              | 42.50MB                        | 42.50MB                       
        PS Old Gen                     | 683.00MB                       | 683.00MB                      
        Metaspace                      | -0.00MB                        | 0.00MB                       
2022-03-12 11:15:31.727 [main] INFO  Engine -
{
        "content":[
                {
                        "reader":{
                                "name":"streamreader",
                                "parameter":{
                                        "column":[
                                                {
                                                        "type":"long",
                                                        "value":"10"
                                                },
                                                {
                                                        "type":"string",
                                                        "value":"hello,DataX"
                                                }
                                        ],
                                        "sliceRecordCount":10
                                }
                        },
                        "writer":{
                                "name":"streamwriter",
                                "parameter":{
                                        "encoding":"UTF-8",
                                        "print":true
                                }
                        }
                }
        ],
        "setting":{
                "speed":{
                        "channel":1
                }
        }
}
2022-03-12 11:15:31.761 [main] WARN  Engine - prioriy set to 0, because NumberFormatException, the value is: null
2022-03-12 11:15:31.765 [main] INFO  PerfTrace - PerfTrace traceId=job_-1, isEnable=false, priority=0
2022-03-12 11:15:31.765 [main] INFO  JobContainer - DataX jobContainer starts job.
2022-03-12 11:15:31.771 [main] INFO  JobContainer - Set jobId = 0
2022-03-12 11:15:31.805 [job-0] INFO  JobContainer - jobContainer starts to do prepare ...
2022-03-12 11:15:31.807 [job-0] INFO  JobContainer - DataX Reader.Job [streamreader] do prepare work .
2022-03-12 11:15:31.809 [job-0] INFO  JobContainer - DataX Writer.Job [streamwriter] do prepare work .
2022-03-12 11:15:31.813 [job-0] INFO  JobContainer - jobContainer starts to do split ...
2022-03-12 11:15:31.814 [job-0] INFO  JobContainer - Job set Channel-Number to 1 channels.
2022-03-12 11:15:31.816 [job-0] INFO  JobContainer - DataX Reader.Job [streamreader] splits to [1] tasks.
2022-03-12 11:15:31.818 [job-0] INFO  JobContainer - DataX Writer.Job [streamwriter] splits to [1] tasks.
2022-03-12 11:15:31.858 [job-0] INFO  JobContainer - jobContainer starts to do schedule ...
2022-03-12 11:15:31.868 [job-0] INFO  JobContainer - Scheduler starts [1] taskGroups.
2022-03-12 11:15:31.875 [job-0] INFO  JobContainer - Running by standalone Mode.
2022-03-12 11:15:31.891 [taskGroup-0] INFO  TaskGroupContainer - taskGroupId=[0] start [1] channels for [1] tasks.
2022-03-12 11:15:31.900 [taskGroup-0] INFO  Channel - Channel set byte_speed_limit to -1, No bps activated.
2022-03-12 11:15:31.900 [taskGroup-0] INFO  Channel - Channel set record_speed_limit to -1, No tps activated.
2022-03-12 11:15:31.927 [taskGroup-0] INFO  TaskGroupContainer - taskGroup[0] taskId[0] attemptCount[1] is started
10        hello,DataX
10        hello,DataX
10        hello,DataX
10        hello,DataX
10        hello,DataX
10        hello,DataX
10        hello,DataX
10        hello,DataX
10        hello,DataX
10        hello,DataX
2022-03-12 11:15:32.029 [taskGroup-0] INFO  TaskGroupContainer - taskGroup[0] taskId[0] is successed, used[105]ms
2022-03-12 11:15:32.030 [taskGroup-0] INFO  TaskGroupContainer - taskGroup[0] completed it's tasks.
2022-03-12 11:15:41.908 [job-0] INFO  StandAloneJobContainerCommunicator - Total 10 records, 130 bytes | Speed 13B/s, 1 records/s | Error 0 records, 0 bytes |  All Task WaitWriterTime 0.000s |  All Task WaitReaderTime 0.000s | Percentage 100.00%
2022-03-12 11:15:41.909 [job-0] INFO  AbstractScheduler - Scheduler accomplished all tasks.
2022-03-12 11:15:41.910 [job-0] INFO  JobContainer - DataX Writer.Job [streamwriter] do post work.
2022-03-12 11:15:41.911 [job-0] INFO  JobContainer - DataX Reader.Job [streamreader] do post work.
2022-03-12 11:15:41.935 [job-0] INFO  JobContainer - DataX jobId [0] completed successfully.
2022-03-12 11:15:41.937 [job-0] INFO  HookInvoker - No hook invoked, because base dir not exists or is a file: /usr/local/datax/hook
2022-03-12 11:15:41.942 [job-0] INFO  JobContainer -
         [total cpu info] =>
                averageCpu                     | maxDeltaCpu                    | minDeltaCpu                   
                -1.00%                         | -1.00%                         | -1.00%
         [total gc info] =>
                 NAME                 | totalGCCount       | maxDeltaGCCount    | minDeltaGCCount    | totalGCTime        | maxDeltaGCTime     | minDeltaGCTime    
                 PS MarkSweep         | 0                  | 0                  | 0                  | 0.000s             | 0.000s             | 0.000s            
                 PS Scavenge          | 0                  | 0                  | 0                  | 0.000s             | 0.000s             | 0.000s           
2022-03-12 11:15:41.943 [job-0] INFO  JobContainer - PerfTrace not enable!
2022-03-12 11:15:41.945 [job-0] INFO  StandAloneJobContainerCommunicator - Total 10 records, 130 bytes | Speed 13B/s, 1 records/s | Error 0 records, 0 bytes |  All Task WaitWriterTime 0.000s |  All Task WaitReaderTime 0.000s | Percentage 100.00%
2022-03-12 11:15:41.949 [job-0] INFO  JobContainer -
任务启动时刻                    : 2022-03-12 11:15:31
任务结束时刻                    : 2022-03-12 11:15:41
任务总计耗时                    :                 10s
任务平均流量                    :               13B/s
记录写入速度                    :              1rec/s
读出记录总数                    :                  10
读写失败总数                    :                   0 


If you see the output above, DataX has been installed and configured successfully.
 

Troubleshooting:
Error: 在有总bps限速条件下,单个channel的bps值不能为空,也不能为非正数 (with a total bps limit in effect, the per-channel bps value cannot be empty or non-positive).
Fix: in core -> transport -> channel -> speed of conf/core.json, change "byte" from -1 to a positive value such as 2000000, i.e. 2 MB per channel. See: https://blog.csdn.net/chrisy521/article/details/122033748
[root@node101 conf]# pwd
/usr/local/datax/conf
[root@node101 conf]# vim core.json
 24         "transport": {
 25             "channel": {
 26                 "class": "com.alibaba.datax.core.transport.channel.memory.MemoryChannel",
 27                 "speed": {
 28                     "byte": 2000000,        # changed from -1
 29                     "record": -1,

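After changing core.json, the same json.tool check confirms the file still parses, since a typo here would break every subsequent job:
[root@node101 conf]# python -m json.tool core.json > /dev/null && echo "core.json OK"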