blog

Task 2 – Crawl the IP addresses and corresponding protocol port numbers of the OFFICE365 and ZOOM servers.

``` from urllib import request from bs4 import BeautifulSoup import pymysql import time from seleniu...

Aug 15, 2020 · 4 min. read
シェア
from urllib import request
from bs4 import BeautifulSoup
import pymysql
import time
from selenium import webdriver
# Create the shared database connection; note the charset and cursorclass
# parameters (DictCursor returns rows as dicts instead of tuples).
# NOTE(review): host=".1" looks truncated by the blog export — presumably
# "127.0.0.1" — and the password is empty; confirm both before running.
conn = pymysql.connect(
 host=".1",
 user="root",
 password="",
 database="spider",
 charset='utf8',
 cursorclass=pymysql.cursors.DictCursor)
# Module-level cursor shared by all spider functions below.
cursor = conn.cursor()
def office365_spider():
    """Crawl Office 365 endpoint documentation pages and store firewall rules.

    For each page, every ``<td>`` cell is collected, the cells are grouped
    into 5-column rows (the 4th column is a comma-separated IP list, the
    5th is ``PROTOCOL: ports``), and one ``application_spider`` record is
    inserted per IP/protocol/port combination.

    Uses the module-level ``cursor`` and ``conn`` database handles.
    Network or parsing errors for a page are printed and swallowed so the
    remaining pages are still processed.
    """
    # NOTE(review): the hostnames below look truncated by the blog export
    # ("https://..com" — presumably "https://docs.microsoft.com"); confirm
    # against the original script before running.
    url3 = "https://..com/en-us/office365/enterprise/urls-and-ip-address-ranges"
    urls = [
        "https://..com/en-us/office365/enterprise/office-365-u-s-government-gcc-high-endpoints",
        "https://..com/en-us/office365/enterprise/office-365-u-s-government-dod-endpoints",
        url3,
        "https://..com/en-us/office365/enterprise/urls-and-ip-address-ranges-21vianet",
        "https://..com/en-us/office365/enterprise/office-365-germany-endpoints",
    ]
    for case in urls:
        html = request.urlopen(case).read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        # Collect every table cell on the page.
        cells = soup.find_all('td')
        texts = []
        for i, cell in enumerate(cells):
            # Skip the two leading header-like cells; url3 has one extra.
            if i in (0, 1) or (case == url3 and i == 2):
                continue
            texts.append(cell.text)
        # Re-group the flat cell list into 5-column rows.
        rows = [texts[i:i + 5] for i in range(0, len(texts), 5)]
        try:
            for item in rows:
                if len(item) != 5:  # trailing partial row
                    continue
                # Normalize full-width punctuation from the page before
                # splitting (the original had a no-op replace(",", ",")
                # here — the full-width characters were lost in the paste).
                ip_list = item[3].replace("，", ",").split(",")
                protocol_and_port = item[4].replace("：", ":").split(":")
                if len(protocol_and_port) < 2:
                    continue
                protocol_tcp = protocol_and_port[0]
                port_tcp = protocol_and_port[1]
                protocol_udp = ""
                port_udp = ""
                # A cell like "TCP: 443 UDP: 3478" yields a UDP entry too.
                if "UDP" in port_tcp:
                    protocol_udp = "UDP"
                    port_tcp = port_tcp.replace("UDP", "")
                    port_udp = protocol_and_port[2]
                for ip in ip_list:
                    # The IP column mixes in hostname/URL entries; skip them.
                    if "/" in ip and ("us" in ip or ".com" in ip):
                        continue
                    if protocol_udp:
                        print("OFFICE365-" + str(ip) + "-" + protocol_udp + "-" + port_udp)
                        cursor.execute(
                            "INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
                            ('OFFICE365', ip, protocol_udp, port_udp))
                    print("OFFICE365-" + str(ip) + "-" + protocol_tcp + "-" + port_tcp)
                    cursor.execute(
                        "INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
                        ('OFFICE365', ip, protocol_tcp, port_tcp))
            # Commit once per page; flattened indentation made the original
            # commit placement ambiguous — confirm against the source.
            conn.commit()
        except Exception as e:
            print(e)
def zoom_spider():
    """Crawl the Zoom firewall/proxy settings page and store firewall rules.

    Selected table rows (4 consecutive ``<td>`` cells each, starting at the
    indices in ``start_indices``) are cleaned up and split into
    (protocol, port, _, IP list); one ``application_spider`` record is
    inserted per IP. Uses the module-level ``cursor``/``conn`` handles.
    """
    # NOTE(review): hostname looks truncated by the blog export
    # ("https://..us" — presumably "https://support.zoom.us").
    url = "https://..us/hc/en-us/articles/201362683-Network-firewall-or-proxy-server-settings-for-Zoom"
    print(url)
    html = request.urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.find_all('td')
    # First-cell index of each table row we care about (4 cells per row).
    start_indices = [4, 8, 12, 20, 24, 28, 32, 40]
    texts = []
    for i in range(len(cells)):
        if i not in start_indices:
            continue
        # Grab this cell and the next three (one logical table row).
        for offset in range(4):
            texts.append(cells[i + offset].text)
    rows = [texts[i:i + 4] for i in range(0, len(texts), 4)]
    # Each row: [protocol, ports, <unused>, IP ranges].
    row_no = 0
    for item in rows:
        if len(item) != 4:  # trailing partial row
            continue
        row_no += 1
        item[1] = item[1].replace("\n", "").replace("\xa0-", ",").replace(" (see note)", "")
        item[3] = item[3].replace("\n", "").replace("IPv4:", " ").replace("IPv6:", " ")
        if row_no == 5:
            # The 5th row's CIDR entries run together; re-split after "32".
            item[3] = item[3].replace("32", "32 ")
        protocol = item[0]
        port = item[1]
        ip_list = item[3].split(" ")
        try:
            for ip in ip_list:
                if not ip:
                    continue
                if ip.count("/") > 1:  # garbled entry, not a single CIDR
                    continue
                print("ZOOM-" + ip + "-" + protocol + "-" + port)
                cursor.execute(
                    "INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
                    ('ZOOM', ip, protocol, port))
            conn.commit()
        except Exception as e:
            print(e)
def salesforce_spider():
    """Crawl the Salesforce IP-range KB article and store firewall rules.

    The page is rendered with JavaScript, so it is loaded through headless
    Chrome; cells 5..374 of the rendered table are treated as IP ranges and
    inserted with protocol "TCP/UDP" and port "ANY". Uses the module-level
    ``cursor``/``conn`` handles.
    """
    # NOTE(review): hostname looks truncated by the blog export
    # ("https://..com" — presumably a help.salesforce.com article URL).
    url = "https://..com/articleView?id=000321501&type=1&mode=1"
    # Headless-Chrome flags for running without a display.
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--no-sandbox', '--window-size=1420,1080', '--headless',
                 '--disable-gpu', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        driver.implicitly_wait(10)
        time.sleep(2)  # give the JS-rendered table time to appear
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        cells = soup.find_all('td')
        try:
            for index, cell in enumerate(cells, start=1):
                # Only cells 5..374 of this page hold IP ranges.
                if index < 5 or index > 374:
                    continue
                # Cheap sanity check that the text looks like a dotted address.
                if cell.text.find(".") < 3:
                    continue
                cursor.execute(
                    "INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
                    ('SALESFORCE', cell.text, "TCP/UDP", "ANY"))
            conn.commit()
        except Exception as e:
            print(e)
    finally:
        # Always shut the browser down, even if page load fails,
        # so headless Chrome processes are not leaked.
        driver.quit()
def main():
    """Run all three spiders, then release the shared database handles.

    The cursor and connection are closed in a ``finally`` block so they are
    released even if one of the spiders raises.
    """
    try:
        office365_spider()
        zoom_spider()
        salesforce_spider()
    finally:
        cursor.close()
        conn.close()


if __name__ == '__main__':
    main()
Read next