Contents

Golang文件下载

go语言写的一个文件下载脚本

晚上想听点有声小说, 要两毛一集, 算了一下,一本书要几百块, 囊中羞涩, 于是拿出电脑搞个脚本

单协程下载

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
package main

import (
	"bufio"
	"io"
	"log"
	"net/http"
	"os"
)

// download fetches one episode by title from the remote site and saves it
// under ./output/. Titles that already exist locally are skipped, so the
// script can be re-run to resume an interrupted batch.
func download(title string) {

	outputFile := "./output/" + title
	// If the file exists, assume a previous run completed it and skip.
	if _, err := os.Stat(outputFile); err == nil {
		log.Println("File exists, skip: ", outputFile)
		return
	}

	url := "https://马赛克xxxx/牧神记/" + title

	log.Println("Downloading: ", url)

	resp, err := http.Get(url)
	if err != nil {
		log.Println("error:" + err.Error())
		// Bug fix: the original fell through here with resp == nil and
		// crashed on resp.Body below; skip this title instead.
		return
	}

	defer func(Body io.ReadCloser) {
		err := Body.Close()
		if err != nil {
			log.Println("close body error:" + err.Error())
		}
	}(resp.Body)

	// Bug fix: without this, a 404/error page would be saved as a bogus
	// "episode" and the skip-if-exists check would hide it forever.
	if resp.StatusCode != http.StatusOK {
		log.Println("unexpected status: ", resp.Status)
		return
	}

	file, err := os.OpenFile(outputFile, os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		// Log and move on rather than panicking: one bad path should not
		// abort a batch of thousands of downloads.
		log.Println("open file error:" + err.Error())
		return
	}
	defer func(file *os.File) {
		err := file.Close()
		if err != nil {
			log.Println("close file error:" + err.Error())
		}
	}(file)

	// Stream the body straight to disk; constant memory regardless of size.
	_, err = io.Copy(file, resp.Body)
	if err != nil {
		log.Println("copy file error:" + err.Error())
	}
	// The site does not seem to ban IPs; if it ever does, a proxy pool
	// such as https://github.com/Python3WebSpider/ProxyPool would help.
	//time.Sleep(time.Second * 3)
}

// main reads episode titles from mp3list.txt (one per line) and downloads
// each one sequentially.
func main() {
	// mp3list.txt is the episode title list copied from the site's index
	// page, e.g.:
	//   0001_天黑别出门.mp3
	//   0002_四灵血.mp3
	//   0003_神通.mp3
	//   ... (thousands of lines omitted)
	file, err := os.Open("mp3list.txt")
	if err != nil {
		panic(err)
	}
	defer func(file *os.File) {
		_ = file.Close()
	}(file)

	// Read the list line by line; each line is one title/filename.
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		title := scanner.Text()
		download(title)
	}
	// Bug fix: a Scanner can stop early (e.g. a line longer than its 64K
	// default buffer); surface that instead of silently truncating the list.
	if err := scanner.Err(); err != nil {
		log.Println("read list error:" + err.Error())
	}
}

多协程下载

感觉单协程太慢, 20个协程同时下载多爽… golang实现起来超级简单

golang chan

搞一个队列

1
var queue = make(chan int, 20)

下载前向队列里放一个元素

1
2
3
		//download file
		queue <- 1
		go download(title)

队列缓冲区最多放20个元素, 放满之后再写入就会阻塞, 从而把同时下载的协程数限制在20个以内

下载完成或跳过(skip)时, 就从队列中取出一个元素, 释放一个并发名额

1
2
3
4
5
6
	// if file exists, skip
	if _, err := os.Stat(outputFile); err == nil {
		log.Println("File exists, skip: ", outputFile)
		<-queue
		return
	}
1
2
	//download finish
	<-queue

当队列为空时, 表示所有下载协程都已完成, 主程序即可退出

1
2
3
4
5
6
7
	for true {
		time.Sleep(time.Second * 2)
		//if queue is empty, exit
		if len(queue) == 0 {
			break
		}
	}

完整代码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
package main

import (
	"bufio"
	"io"
	"log"
	"net/http"
	"os"
	"time"
)

var queue = make(chan int, 20)

// download fetches one episode by title and saves it under ./output/.
// It is run as a goroutine; main puts one token into queue before each
// launch, and this function must release exactly one token when it ends.
func download(title string) {
	// Release the concurrency token on EVERY exit path (skip, error, or
	// success). Bug fix: the original released it manually on only two
	// paths, so an http.Get failure both crashed on nil resp and leaked a
	// slot, which would eventually stall main's queue<-1 sends.
	defer func() { <-queue }()

	outputFile := "./output/" + title
	// If the file exists, assume a previous run completed it and skip.
	if _, err := os.Stat(outputFile); err == nil {
		log.Println("File exists, skip: ", outputFile)
		return
	}

	url := "https://马赛克xxxx/牧神记/" + title

	log.Println("Downloading: ", url)

	resp, err := http.Get(url)
	if err != nil {
		log.Println("error:" + err.Error())
		// Bug fix: resp is nil here; the original fell through and
		// crashed on resp.Body below.
		return
	}

	defer func(Body io.ReadCloser) {
		err := Body.Close()
		if err != nil {
			log.Println("close body error:" + err.Error())
		}
	}(resp.Body)

	// Bug fix: without this, an error page would be saved as a bogus
	// "episode" and the skip-if-exists check would hide it forever.
	if resp.StatusCode != http.StatusOK {
		log.Println("unexpected status: ", resp.Status)
		return
	}

	file, err := os.OpenFile(outputFile, os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		// Log and skip: panicking in one goroutine would kill the whole
		// process, including the other in-flight downloads.
		log.Println("open file error:" + err.Error())
		return
	}
	defer func(file *os.File) {
		err := file.Close()
		if err != nil {
			log.Println("close file error:" + err.Error())
		}
	}(file)

	// Stream the body straight to disk; constant memory regardless of size.
	_, err = io.Copy(file, resp.Body)
	if err != nil {
		log.Println("copy file error:" + err.Error())
	}
	// The site does not seem to ban IPs; if it ever does, a proxy pool
	// such as https://github.com/Python3WebSpider/ProxyPool would help.
	//time.Sleep(time.Second * 3)
}

// main reads episode titles from mp3list.txt (one per line) and downloads
// them concurrently, at most cap(queue) (20) at a time.
func main() {
	// mp3list.txt is the episode title list copied from the site's index page.
	file, err := os.Open("mp3list.txt")
	if err != nil {
		panic(err)
	}
	defer func(file *os.File) {
		_ = file.Close()
	}(file)

	// Read the list line by line; each line is one title/filename.
	scanner := bufio.NewScanner(file)

	for scanner.Scan() {
		title := scanner.Text()
		// Acquire a token before spawning; when the buffered channel is
		// full this send blocks, capping the number of concurrent
		// downloads. Each download releases its token when it finishes.
		queue <- 1
		go download(title)
	}
	// Bug fix: a Scanner can stop early (e.g. a line longer than its 64K
	// default buffer); surface that instead of silently truncating the list.
	if err := scanner.Err(); err != nil {
		log.Println("read list error:" + err.Error())
	}

	// All tokens were sent above, so an empty queue means every download
	// goroutine has taken its token back, i.e. all downloads are done.
	// NOTE(review): a sync.WaitGroup would avoid this 2s polling; the
	// channel-counting design is kept as described in the post.
	for len(queue) > 0 {
		time.Sleep(time.Second * 2)
	}
}

代理池

我本次遇到的网站并没有WAF (Web Application Firewall), 所以可以狂下载

如果遇到被封IP的情况, 可以使用代理池:

https://github.com/Python3WebSpider/ProxyPool

深入理解

欢迎阅读本博客中的 go语言中channel是如何工作的