A file download script written in Go
I wanted to listen to an audiobook in the evening, but each episode costs 0.2 yuan, and a whole book works out to several hundred yuan. Being short on cash, I pulled out my laptop and wrote a script instead.
Single-goroutine download
package main

import (
	"bufio"
	"io"
	"log"
	"net/http"
	"os"
)

func download(title string) {
	outputFile := "./output/" + title
	// if the file already exists, skip it
	if _, err := os.Stat(outputFile); err == nil {
		log.Println("File exists, skip: ", outputFile)
		return
	}
	url := "https://马赛克xxxx/牧神记/" + title
	log.Println("Downloading: ", url)
	resp, err := http.Get(url)
	if err != nil {
		log.Println("error:" + err.Error())
		return
	}
	defer func(Body io.ReadCloser) {
		err := Body.Close()
		if err != nil {
			log.Println("close body error:" + err.Error())
		}
	}(resp.Body)
	file, err := os.OpenFile(outputFile, os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		panic(err)
	}
	defer func(file *os.File) {
		err := file.Close()
		if err != nil {
			log.Println("close file error:" + err.Error())
		}
	}(file)
	_, err = io.Copy(file, resp.Body)
	if err != nil {
		log.Println("copy file error:" + err.Error())
	}
	// The site doesn't seem to ban IPs; if it does, a proxy pool such as
	// https://github.com/Python3WebSpider/ProxyPool can be used
	//time.Sleep(time.Second * 3)
}

func main() {
	// mp3list.txt is the list of episode titles copied from the site's index page:
	// 0001_天黑别出门.mp3
	// 0002_四灵血.mp3
	// 0003_神通.mp3
	// ...
	// (thousands more lines omitted)
	file, err := os.Open("mp3list.txt")
	if err != nil {
		panic(err)
	}
	defer func(file *os.File) {
		_ = file.Close()
	}(file)
	// read the file line by line
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		title := scanner.Text()
		// download this episode
		download(title)
	}
}
Multi-goroutine download
A single goroutine feels too slow; downloading with 20 goroutines at once is far more satisfying, and it is very easy to implement in Go.
Use a Go channel as a queue:
var queue = make(chan int, 20)
Before each download, push an element onto the queue:
//download file
queue <- 1
go download(title)
The channel's buffer holds at most 20 elements; once it is full, further sends block (a tiny demo of this blocking behavior follows the snippets below).
When a download finishes or is skipped, an element is popped from the queue:
// if file exists, skip
if _, err := os.Stat(outputFile); err == nil {
	log.Println("File exists, skip: ", outputFile)
	<-queue
	return
}
//download finish
<-queue
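As a tiny standalone demo (not part of the original script; the capacity is 2 instead of 20 so the effect shows up immediately), this is what the blocking looks like:

package main

import (
	"fmt"
	"time"
)

func main() {
	queue := make(chan int, 2) // capacity 2 for the demo instead of 20
	queue <- 1
	queue <- 1 // the buffer is now full
	go func() {
		time.Sleep(time.Second)
		<-queue // frees one slot after a second
	}()
	start := time.Now()
	queue <- 1 // blocks here until the goroutine above receives
	fmt.Println("third send completed after", time.Since(start))
}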
When the queue is empty, all downloads have finished and the program can exit:
for {
	time.Sleep(time.Second * 2)
	// if the queue is empty, exit
	if len(queue) == 0 {
		break
	}
}
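The polling loop works, but the same "wait until everything finishes" idea can also be expressed without sleeping and checking len(queue). The following is only a sketch, not part of the original script: it keeps the buffered channel to cap concurrency and uses the standard library's sync.WaitGroup for the exit condition, with a short sleep standing in for download(title).

package main

import (
	"log"
	"sync"
	"time"
)

func main() {
	queue := make(chan int, 20) // still caps concurrency at 20
	var wg sync.WaitGroup
	for i := 0; i < 100; i++ {
		queue <- 1
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			defer func() { <-queue }()
			time.Sleep(100 * time.Millisecond) // stand-in for download(title)
			log.Println("done:", id)
		}(i)
	}
	wg.Wait() // returns only after every goroutine has called Done
}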
Full code
package main

import (
	"bufio"
	"io"
	"log"
	"net/http"
	"os"
	"time"
)

var queue = make(chan int, 20)

func download(title string) {
	outputFile := "./output/" + title
	// if the file already exists, skip it
	if _, err := os.Stat(outputFile); err == nil {
		log.Println("File exists, skip: ", outputFile)
		<-queue
		return
	}
	url := "https://马赛克xxxx/牧神记/" + title
	log.Println("Downloading: ", url)
	resp, err := http.Get(url)
	if err != nil {
		log.Println("error:" + err.Error())
		<-queue
		return
	}
	defer func(Body io.ReadCloser) {
		err := Body.Close()
		if err != nil {
			log.Println("close body error:" + err.Error())
		}
	}(resp.Body)
	file, err := os.OpenFile(outputFile, os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		panic(err)
	}
	defer func(file *os.File) {
		err := file.Close()
		if err != nil {
			log.Println("close file error:" + err.Error())
		}
	}(file)
	_, err = io.Copy(file, resp.Body)
	if err != nil {
		log.Println("copy file error:" + err.Error())
	}
	// The site doesn't seem to ban IPs; if it does, a proxy pool such as
	// https://github.com/Python3WebSpider/ProxyPool can be used
	//time.Sleep(time.Second * 3)
	// download finished
	<-queue
}

func main() {
	// open mp3list.txt (the list of episode titles)
	file, err := os.Open("mp3list.txt")
	if err != nil {
		panic(err)
	}
	defer func(file *os.File) {
		_ = file.Close()
	}(file)
	// read the file line by line
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		title := scanner.Text()
		// take a slot, then download in a new goroutine
		queue <- 1
		go download(title)
	}
	for {
		time.Sleep(time.Second * 2)
		// if the queue is empty, exit
		if len(queue) == 0 {
			break
		}
	}
}
Proxy pool
The site I dealt with this time has no WAF (Web Application Firewall), so I could download at full speed.
If your IP does get banned, a proxy pool can help:
https://github.com/Python3WebSpider/ProxyPool
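As a sketch of how a proxy could be wired into the downloader (not part of the original script; the /random endpoint and its plain "ip:port" response are assumptions about ProxyPool's API, so check that project's README), Go's http.Client can route requests through a proxy via http.Transport:

package main

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
)

func main() {
	// Assumption: a local ProxyPool instance serves a random proxy as
	// plain "ip:port" text at this endpoint (verify against its README).
	resp, err := http.Get("http://127.0.0.1:5555/random")
	if err != nil {
		log.Fatal(err)
	}
	raw, err := io.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		log.Fatal(err)
	}
	proxyURL, err := url.Parse("http://" + strings.TrimSpace(string(raw)))
	if err != nil {
		log.Fatal(err)
	}
	// A client that sends every request through the proxy; download()
	// could use client.Get(url) instead of http.Get(url).
	client := &http.Client{
		Transport: &http.Transport{Proxy: http.ProxyURL(proxyURL)},
	}
	r, err := client.Get("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	defer r.Body.Close()
	log.Println("status via proxy:", r.Status)
}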
Going deeper
You are also welcome to read the post on this blog about how channels work in Go (go语言中channel是如何工作的).