背景
有一個工作郵箱,會不斷收到許多人投遞的簡歷。由於郵件數量比較多,因此產生了一個需求:自動將郵件從郵件服務器取回到本地,並將郵件的基本信息存入本地的 SQLite 數據庫;郵件的正文則以單獨文件的形式存放在按日期命名的文件夾下。
實現
備注:在python2.7下測試運行正常,如果用python3,可能需要對代碼稍做修改。
1,郵件配置參數文件
mail.conf
[mail163]
# 此處應寫上你實際的帳號與密碼
user = xxxx@163.com
password = xxxxx
pop3_server = pop3.163.com

[sqlite]
dir = sqlite
fileName = mailLog.db
2.sqlite數據表的結構
3.從郵件服務器收取郵件的python代碼
mailManager.py
# -*- coding:utf-8 -*-
# 讀取郵件并解碼存入日志數據庫
import
poplib
import
email
import
ConfigParser
import
os
,
sys
,
string
,
time
from
email
.
parser
import
Parser
from
email
.
header
import
decode_header
from
email
.
utils
import
parseaddr
from
logHelper
import
LogHelper
def getBody(msg, guessedCharset='gb2312'):
    """Return the decoded text body of an email message.

    Multipart messages are walked recursively and the text of every
    ``text/plain`` and ``text/html`` leaf is concatenated in order. Leaves
    of any other content type contribute nothing.

    Args:
        msg: an ``email.message.Message`` instance.
        guessedCharset: charset used when a text part does not declare one
            in its own Content-Type header.

    Returns:
        Whatever ``decodeString`` produces for the text parts (decoded text,
        or the raw payload when it could not be decoded); '' when the
        message has no text part.
    """
    if msg.is_multipart():
        # The top-level header of a multipart mail usually carries no
        # charset, so keep the historical 'gb2312' fallback for nested parts.
        fallback = guessedCharset or 'gb2312'
        bodyText = ''
        for part in msg.get_payload():
            try:
                # Pass the fallback down; it used to be dropped here, so
                # nested parts ignored the caller's guess.
                bodyText += getBody(part, fallback)
            except UnicodeDecodeError as e:
                # Happens when an undecodable raw part is concatenated with
                # already-decoded text. Skip the part but say why:
                # e.message is empty for UnicodeDecodeError.
                print(e)
        return bodyText

    if msg.get_content_type() not in ('text/plain', 'text/html'):
        return ''

    content = msg.get_payload(decode=True)
    # Prefer the charset this part declares for itself over the guess made
    # for the whole message.
    charset = msg.get_content_charset() or guessedCharset
    return decodeString(content, charset, 'body', guessedCharset)
def decode_strInHeader(s, guessedCharset='gb2312'):
    """Decode an RFC 2047 encoded header value into text.

    Every fragment returned by ``decode_header`` is decoded and the results
    are joined. Previously only the first fragment was used, so a subject
    such as ``Re: =?gb2312?B?...?=`` came back as just ``Re:``.

    Args:
        s: raw header value.
        guessedCharset: charset applied to fragments that carry no charset
            of their own.

    Returns:
        For a single-fragment header, exactly what ``decodeString`` returns.
        For a multi-fragment header, the joined text.
    """
    fragments = decode_header(s)
    if len(fragments) == 1:
        value, charset = fragments[0]
        return decodeString(value, charset, 'header', guessedCharset)

    result = u''
    prevEncoded = None
    for value, charset in fragments:
        text = decodeString(value, charset, 'header', guessedCharset)
        if isinstance(text, bytes):
            # decodeString gave up on this fragment; coerce it so that the
            # join below cannot raise on mixed byte/text pieces.
            text = text.decode('ascii', 'replace')
        encoded = charset is not None
        # decode_header may drop the whitespace that separated an encoded
        # word from plain text, so put one space back at such a junction.
        # Adjacent encoded words are joined without a space.
        if (result and text and encoded != prevEncoded
                and not result[-1].isspace() and not text[0].isspace()):
            result += u' '
        result += text
        prevEncoded = encoded
    return result
def decodeString(s, charset, extra='header', guessedCharset='gb2312'):
    """Decode a byte string using the given charset, with lenient fallbacks.

    Args:
        s: the value to decode; anything that is not a byte string is
            returned unchanged.
        charset: declared charset name, or None to use ``guessedCharset``.
        extra: label included in the diagnostic message ('header', 'body').
        guessedCharset: charset used when ``charset`` is None.

    Returns:
        The decoded text, or ``s`` unchanged when no charset is available or
        every decoding attempt failed (a diagnostic is printed in the
        latter case).
    """
    value = s
    if charset is None:
        charset = guessedCharset
    if not charset or not isinstance(value, bytes):
        # No charset to decode with, or the value is not raw bytes.
        return value

    if not hasattr(charset, 'strip'):
        # e.g. an email.charset.Charset object rather than a plain name.
        charset = str(charset)
    # Charset names are case-insensitive and sometimes arrive quoted.
    charset = charset.strip().strip('"\'').strip().lower()

    try:
        return value.decode(charset)
    except (UnicodeError, LookupError):
        # Undecodable bytes or unknown codec name: try the fallbacks below.
        pass

    if charset == 'gb2312':
        # Mail labelled gb2312 often contains characters that only exist in
        # the larger GBK set.
        try:
            return value.decode('gbk')
        except UnicodeError:
            print("decode error in decodeString!", 'gbk', extra)
    elif charset in ('utf8', 'utf-8'):
        # Drop the undecodable bytes instead of losing the whole text.
        # 'utf-8' is the usual spelling and used to miss this branch.
        return value.decode('utf-8', 'ignore')
    else:
        print("decode error in decodeString!", charset, extra)
    return value
def guess_charset(msg):
    """Return the charset name declared by a message, or None.

    Args:
        msg: an ``email.message.Message`` instance.

    Returns:
        The charset name as a string, or None when the message declares no
        charset.
    """
    charset = msg.get_charset()
    if charset is not None:
        # get_charset() yields a Charset object, while the decoding helpers
        # in this file expect a plain codec name.
        return str(charset)
    # get_content_charset() parses the Content-Type parameters properly.
    # Slicing after 'charset=' kept quotes and any trailing parameters,
    # e.g. 'utf-8; format=flowed'.
    return msg.get_content_charset()
def today():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    now = time.localtime()
    return time.strftime("%Y-%m-%d", now)
def ensureDir(dir):
    """Create directory ``dir``, including missing parents, if needed.

    Args:
        dir: path of the directory that must exist afterwards.

    Raises:
        OSError: if the directory cannot be created.
    """
    if os.path.isdir(dir):
        return
    try:
        # makedirs also creates missing parent directories; os.mkdir failed
        # when the parent did not exist yet.
        os.makedirs(dir)
    except OSError:
        # Another process may have created it between the check and the
        # call; only re-raise when the directory is really missing.
        if not os.path.isdir(dir):
            raise
def logOneMail(server, index, dir, logHelper, parseScope='new'):
    """Fetch one message, save its body to disk and record it in the log DB.

    Args:
        server: an authenticated ``poplib.POP3`` connection.
        index: message number on the server (numbering starts at 1).
        dir: base directory; the body is stored under ``<dir>/<today>/``.
        logHelper: object providing ``msgExists(messageID)`` and
            ``append(messageID, fromAddr, subject, contentFile, dateStr)``.
        parseScope: when 'new', a message that is already logged marks the
            end of the scan.

    Returns:
        'scopeEnd' when ``parseScope`` is 'new' and the message is already
        logged; otherwise 'ok' after the message has been appended.
    """
    print('log Mail:', index)
    # retr() returns (response, lines, octets); only the raw lines are used.
    lines = server.retr(index)[1]
    msg = email.message_from_string(b'\r\n'.join(lines))

    guessedCharset = guess_charset(msg)

    # msg.get() falls back to '' when a header is missing.
    subject = decode_strInHeader(msg.get("subject", ''), guessedCharset)
    # Only the display-name half of the From header is logged.
    senderName = parseaddr(msg.get("from", ''))[0]
    fromAddr = decode_strInHeader(senderName, guessedCharset)
    messageID = decode_strInHeader(msg.get("Message-ID", ''), guessedCharset)
    print('mail message id:', messageID)
    dateStr = decode_strInHeader(msg.get("Date", ''), guessedCharset)

    # The file name is derived from the Message-ID, which the sender
    # controls. Neutralise path separators and other characters that are
    # invalid in file names so it cannot escape the target directory.
    baseName = messageID.strip().replace('<', '').replace('>', '')
    baseName = ''.join(
        '_' if (ch in '/\\:*?"|' or ord(ch) < 32) else ch for ch in baseName
    )
    if not baseName:
        # Mails without a Message-ID used to share the name '.html'.
        baseName = 'no-message-id-%s' % index

    # One folder per day holds the message bodies.
    curDir = dir + '/' + today() + '/'
    ensureDir(curDir)
    contentFile = curDir + '/' + baseName + '.html'
    if not os.path.exists(contentFile):
        # Build the body before opening the file, so that a failure does
        # not leave an empty file that later runs would skip.
        body = getBody(msg, guessedCharset)
        if not isinstance(body, bytes):
            # Encode explicitly rather than rely on the default encoding.
            body = body.encode('utf-8')
        with open(contentFile, 'wb') as outFile:
            outFile.write(body)

    # Check whether the end of the requested scope has been reached.
    if parseScope == 'new' and logHelper.msgExists(messageID):
        return 'scopeEnd'

    logHelper.append(messageID, fromAddr, subject, contentFile, dateStr)
    return 'ok'
def logTheMails(progressKey, parseScope='new'):
    """Log the most recent mails from the POP3 server, newest first.

    Connection settings and the database location are read from
    ``mail.conf`` in the current working directory.

    Args:
        progressKey: identifier of a batch progress counter; currently
            unused and kept for future use.
        parseScope: 'all' to process every mail, 'new' to stop at the first
            mail that is already logged, or a number (int or numeric
            string) to process that many of the newest mails.

    Returns:
        The number of mails that were logged.

    Raises:
        ValueError: if ``parseScope`` is neither 'all', 'new' nor numeric.
    """
    cf = ConfigParser.ConfigParser()
    cf.read("mail.conf")
    user = cf.get("mail163", "user")
    password = cf.get("mail163", "password")
    pop3_server = cf.get("mail163", "pop3_server")
    baseDir = cf.get("sqlite", "dir")
    dbFileFullName = baseDir + '/' + cf.get("sqlite", "fileName")

    # Validate the scope before opening a connection that would then have
    # to be torn down again.
    if parseScope in ('all', 'new'):
        requested = None
    else:
        requested = int(parseScope)

    # NOTE(review): poplib.POP3 sends the password unencrypted; consider
    # poplib.POP3_SSL if the server supports it.
    server = poplib.POP3(pop3_server)
    receivedCount = 0
    try:
        server.user(user)
        server.pass_(password)

        logHelper = LogHelper(dbFileFullName)

        # list() returns (response, listing, octets); the listing length is
        # the number of mails in the mailbox.
        total = len(server.list()[1])
        if requested is None:
            logCount = total
        else:
            # Never ask for more mails than exist: an unclamped count used
            # to reach index 0 and below.
            logCount = max(0, min(requested, total))

        # Message numbers start at 1 and the newest mail is number `total`.
        for index in range(total, total - logCount, -1):
            flag = logOneMail(server, index, baseDir, logHelper, parseScope)
            if flag == 'scopeEnd':
                break
            receivedCount += 1
    finally:
        # Always close the server connection, also when a mail fails.
        server.quit()
    return receivedCount
def logMailsByIndex(beginIndex, endIndex):
    """Log the mails whose server index lies in [beginIndex, endIndex].

    Connection settings and the database location are read from
    ``mail.conf`` in the current working directory. Both bounds are clamped
    to the valid range 1..total.

    Args:
        beginIndex: first message number to process.
        endIndex: last message number to process (inclusive).

    Returns:
        The number of mails that were newly logged.
    """
    cf = ConfigParser.ConfigParser()
    cf.read("mail.conf")
    user = cf.get("mail163", "user")
    password = cf.get("mail163", "password")
    pop3_server = cf.get("mail163", "pop3_server")
    baseDir = cf.get("sqlite", "dir")
    dbFileFullName = baseDir + '/' + cf.get("sqlite", "fileName")

    # NOTE(review): poplib.POP3 sends the password unencrypted; consider
    # poplib.POP3_SSL if the server supports it.
    server = poplib.POP3(pop3_server)
    receivedCount = 0
    try:
        server.user(user)
        server.pass_(password)

        logHelper = LogHelper(dbFileFullName)

        # list() returns (response, listing, octets); the listing length is
        # the number of mails in the mailbox.
        total = len(server.list()[1])

        # Message numbers start at 1. Clamp both ends so that an empty
        # mailbox or a begin index below 1 never requests message 0.
        first = max(1, min(beginIndex, total))
        last = min(endIndex, total)

        for index in range(first, last + 1):
            flag = logOneMail(server, index, baseDir, logHelper)
            if flag == 'scopeEnd':
                # Already logged. The scan is ascending over an explicit
                # range, so skip this mail instead of abandoning the rest.
                continue
            receivedCount += 1
    finally:
        # Always close the server connection, also when a mail fails.
        server.quit()
    return receivedCount
4.根據命令行參數,讀取指定范圍(全部、新郵件或最近 N 封)郵件的代碼
fetchMails.py
# -*- coding:utf-8 -*-
#讀取郵件
import
os
,
sys
,
string
import
time
import
getopt
import
mailManager
# Python 2 only: reload(sys) makes setdefaultencoding available again so
# that implicit str/unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding("utf-8")

# Command line options:
#   -p / --progKey     key of the progress counter (reserved for later use)
#   -m / --mailBoxIdx  mailbox number (reserved for later use)
#   -s / --scope       'all' = every mail, 'new' = only newly received
#                      mails, <number> = that many of the newest mails
try:
    opts, args = getopt.getopt(
        sys.argv[1:], 'p:m:s:', ['progKey=', 'mailBoxIdx=', 'scope='])
except getopt.GetoptError:
    print('error:', 'options invalid')
    # Exit with a non-zero status so that callers can detect the failure.
    sys.exit(2)

progressKey = ''
parseScope = 'new'
try:
    for k, v in opts:
        if k in ("-p", "--progKey"):
            progressKey = v
        elif k in ("-m", "--mailBoxIdx"):
            mailBoxIndex = int(v)
        elif k in ("-s", "--scope"):
            parseScope = v
    if parseScope not in ('all', 'new'):
        # Reject a bad scope here rather than after connecting to the
        # mail server.
        int(parseScope)
except ValueError:
    print('error:', 'options invalid')
    sys.exit(2)

print('oldCwd:', os.getcwd())
# Switch to the directory of this file.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
print('newCwd:', os.getcwd())

print('')
print('fetch mails : begin...')
print('')

startTime = time.time()
if progressKey == '':
    progressKey = 'tempKey1'

# Fetch the mails and record them in the sqlite database.
receivedCount = mailManager.logTheMails(progressKey, parseScope)

print('')
print('receivedCount:', receivedCount)
print('')

endTime = time.time()
print('used time/minutes: ', (endTime - startTime) / 60)
更多文章、技術交流、商務合作、聯系博主
微信掃碼或搜索:z360901061

微信掃一掃加我為好友
QQ號聯系: 360901061
您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點擊下面給點支持吧,站長非常感激您!手機微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然后點擊微信右上角掃一掃功能,選擇支付二維碼完成支付。
【本文對您有幫助就好】元
