抓取网页的脚本

先上脚本    
[root@mysql scripts]# cat html_to_table.sh        
#!/bin/bash
# oldboy linux training
# 2015-06-01
# Happy Children's Day
# 说明:本脚本来自老男孩linux21期学员张耀开发!
EduFile=/tmp/edu.html
EduFile2=/tmp/edu2.html
Url="$*"
 
# Check for given parameters
[ $# -eq 0 ] && {
   echo "USAGE: /bin/sh $0 http://...."
   exit 1
}
 
# Judge url is ok?
curl -I $Url &>/dev/null
[ $? -ne 0 ] &&{
   echo "Bad url,Please check it"
   exit 1
}
 
# Defined get pagenum and CourseId Functions
function getnum(){
        curl -s $Url>$EduFile
        grep '"pagesGoEnd"' $EduFile &>/dev/null
        if [ $? -eq 0 ]
          then
            num=`sed -rn 's#.*page=([0-9].*)" class="pagesGoEnd".*$#\1#gp' $EduFile`
        else
            num=`sed -rn 's|.*page=([0-9].*)#" class="pagesNum".*$|\1|gp' $EduFile`
        fi
        pagenum=${num:-1}
        CourseId=`echo $Url|awk -F "[-.]" '{print $4}'`
}
 
 
# Defined curl html Functions
function Curl(){
        getnum
        for i in `seq $pagenum`
          do
            curl "http://edu.51cto.com/index.php?do=course&m=lessions&course_id=$CourseId&page=$i" 1>>$EduFile 2>/dev/null
        done
}
 
# Defined Create table Functions
function table(){
        sum=""
        index=1
        sed -rn '/do=lesson/ s#<.*(<a href=")(.*)</h4>#\1http://edu.51cto.com\2#gp' $EduFile > $EduFile2
        while read line
          do
            sum=$sum"<tr><th width="40" scope="row">$index</th><td width="520">$line</td>"
            ((index++))
        done <$EduFile2
}
 
# Defined Create html Functions
function html(){
        cat >/tmp/oldboy.html<<-END
        <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <title>test</title>
        </head>
 
        <body>
        <table width="560" border="1">
        $sum
        </table>
        </body>
        </html>
        END
}
 
function main(){
        Curl
        table
        html
}
 
main


测试网页http://edu.51cto.com/course/course_id-839.html
1
2
    
[root@mysql scripts]# sh html_to_table.sh http://edu.51cto.com/course/course_id-839.html
[root@mysql scripts]# sz /tmp/oldboy.html

我将网页文件的table部分粘贴上来

分割线
感谢打赏
江西数库信息技术有限公司
YWSOS.COM 平台代运维解决方案
 评论
 发表评论
姓   名:

Powered by AKCMS