# Deprecated. Migrated to WikiTrend_2.0
# Author: Jaeha Choi
# WikiPageview.py pre-processes daily Wikipedia pageview tables and creates
# training datasets for WikiTrend.
import ast
import csv
import datetime
import time
from itertools import repeat
from multiprocessing import Pool

import pandas as pd
import pymysql as mariadb

# Silence pandas' chained-assignment warning; the in-place writes below rely on it.
pd.options.mode.chained_assignment = None

WORKING_DIR = "E:/Projects/WikiTrend/"
FILE_N = "2020-06"  # base name of this month's csv files
WINDOW_SIZE = 4  # context window size (in ranking positions) for word pairs
INPUT_DIM = 50000  # number of top-ranked words kept for training

# .auth holds three lines: db user, db password, db name
with open(WORKING_DIR + "/.auth", "r", encoding="utf-8") as auth:
    db_usr = auth.readline().strip()
    db_pw = auth.readline().strip()
    db_name = auth.readline().strip()

db = mariadb.connect(
    host='localhost',
    port=3306,
    user=db_usr,
    passwd=db_pw,
    db=db_name,
    charset='utf8mb4',
    autocommit=True
)
cursor = db.cursor()

all_dict = {}
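# all_dict maps a word (page title) to its per-day view totals, e.g.
# (illustrative values, not real data):
#   all_dict["Python"] = {"2020-06-01": 4521, "2020-06-02": 4389}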


def get_from_db(date: datetime.datetime):
    """Read one daily table and accumulate per-word view totals into all_dict."""
    cursor.execute("SELECT * FROM `" + str(date.date()) + "`")
    records = cursor.fetchall()
    print("Date: " + str(date.date()) + "\tLength: " + str(len(records)))
    for row in records:
        daily = 0
        # if not -1 in row:  # replace -1 with 5?
        # row[1:] holds the hourly counts; -1 marks a missing hour and is
        # imputed as 10 views.
        for hourly in row[1:]:
            if hourly == -1:
                daily += 10
            else:
                daily += hourly
        all_dict.setdefault(row[0], {})
        all_dict[row[0]][str(date.date())] = daily
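
# Assumed table layout (one table per day, named after the date; inferred from
# the loop above rather than documented in this file):
#   row[0]  -> word / page title (str)
#   row[1:] -> hourly view counts (int), -1 where that hour's data is missing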


def create_all_dict(start_date: datetime.datetime, end_date: datetime.datetime):
    """Populate all_dict for every day in [start_date, end_date]."""
    curr_date = start_date
    while curr_date <= end_date:
        get_from_db(curr_date)
        curr_date += datetime.timedelta(days=1)
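
# Example (illustrative): create_all_dict(datetime.datetime(2020, 6, 1),
# datetime.datetime(2020, 6, 3)) loads the daily tables `2020-06-01` through
# `2020-06-03` into all_dict.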


def read_raw_csv(raw_filename):
    """Rebuild all_dict from a .raw.csv file previously written by create_csv."""
    with open(raw_filename, "r", encoding="utf-8") as raw_file:
        all_dict = ast.literal_eval(raw_file.readline().strip())
    print("all_dict loaded")
    return all_dict
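
# Example (illustrative): restore a previous month's totals without
# re-querying the database:
#   all_dict = read_raw_csv(WORKING_DIR + FILE_N + ".raw.csv")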


def create_csv(start_date: datetime.datetime, end_date: datetime.datetime):
    """Write <FILE_N>.csv (tab-separated table) and <FILE_N>.raw.csv (repr of all_dict)."""
    filename = WORKING_DIR + FILE_N + ".csv"
    raw_filename = WORKING_DIR + FILE_N + ".raw.csv"
    curr_date = start_date
    col = "word\t"
    # pool_list = []
    # single thread, single processor: 7.2879811445871985 min
    # multithread, single processor: 9.481982696056367 min
    while curr_date <= end_date:
        col += str(curr_date.date()) + "\t"
        get_from_db(curr_date)
        # pool_list.append(curr_date)
        curr_date += datetime.timedelta(days=1)
    # with Pool(processes=7) as pool:
    #     pool.map(get_from_db, pool_list)
    file = open(filename, "w", encoding="utf-8")
    file.write(col + "average\n")
    for word in all_dict:
        file.write(word + "\t")
        curr_date = start_date
        total = 0
        total_data = 0
        while curr_date <= end_date:
            if str(curr_date.date()) in all_dict[word]:
                file.write(str(all_dict[word][str(curr_date.date())]) + "\t")
                total += all_dict[word][str(curr_date.date())]
                total_data += 1
            else:
                # -1 marks a day with no record for this word
                file.write("-1" + "\t")
            curr_date += datetime.timedelta(days=1)
        if total_data != 0:
            file.write(str(total / total_data) + "\n")
        else:
            file.write("\n")
    file.close()
    print("csv file created.")
    raw_file = open(raw_filename, "w", encoding="utf-8")
    raw_file.write(str(all_dict))
    raw_file.close()
    print("raw.csv file created.")
    print()
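
# Layout of the generated <FILE_N>.csv (tab-separated; values illustrative):
#   word    2020-06-01  2020-06-02  2020-06-03  average
#   Foo     123         98          -1          110.5
# The average column is computed only over days that actually had data.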


def pool_execute(df, word_df, date: str):
    """Build (center, context) index pairs from the popularity ranking for one date."""
    dataset = []
    temp = df.sort_values(date, ascending=False)
    # print(temp)
    # print(temp.index.values[3078])
    # print(word_df[3075])
    for temp_idx, temp_word in enumerate(temp.index.values):
        # print status every 10%
        if temp_idx % (INPUT_DIM * 0.1) == 0:
            print(str(datetime.datetime.now()) + "\t" + date + "\t\t\t" + str(
                temp_idx / (INPUT_DIM * 0.01)) + "%\t\t\t" + str(temp_idx))
        # pair each word with its neighbors in the ranking, skipping itself
        for i in range(-WINDOW_SIZE, WINDOW_SIZE + 1):
            if i == 0:
                continue
            if 0 <= temp_idx + i < INPUT_DIM:
                # print(temp.index.values[temp_idx + i])
                try:
                    dataset.append([word_df.get_loc(temp_word),
                                    word_df.get_loc(temp.index.values[temp_idx + i])])
                except (KeyError, IndexError):
                    print(temp_word)
                    print("Temp idx: " + str(temp_idx) + "\ti: " + str(i))
                    print(str(temp.index.values[temp_idx + i]) + "," + str(temp_idx + i))
                    print()
    print(str(datetime.datetime.now()) + "\t" + date + "\t\t\t" + "100.0%\t\t\t" + str(INPUT_DIM))
    return dataset, date
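
# Worked example with WINDOW_SIZE = 2 (illustrative): if the ranking for a
# date is [A, B, C, D, E], then C (temp_idx = 2) emits the pairs
# (C, A), (C, B), (C, D), (C, E), i.e. a skip-gram-style window taken over
# the popularity ranking instead of over sentence order.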


def preprocess():
    # load the per-day totals produced by create_csv
    df = pd.read_csv(WORKING_DIR + FILE_N + ".csv", encoding="utf-8", sep="\t", index_col=0)
    # TODO: consider adding more normalization
    def df_delta():
        """Replace each day's value with its day-over-day change (currently unused)."""
        col_list = df.columns.values
        # print(df.head(5))
        prev = df[col_list[0]].copy()
        del df[col_list[0]]
        # exclude the first day and the average (last column)
        for date in col_list[1:-1]:
            updated = df[date] - prev
            prev = df[date].copy()
            df[date] = updated
    def df_avg_delta():
        """Replace each day's value with its offset from the word's average (currently unused)."""
        col_list = df.columns.values
        # print(df.head(5))
        avg = df["average"]
        # exclude the average (last column)
        for date in col_list[:-1]:
            df[date] -= avg
    def df_avg_delta_normalize():
        """Scale each day's offset from the average by a magnitude-based modifier."""
        col_list = df.columns.values
        avg = df["average"]
        # exclude the average (last column)
        for word_idx in range(INPUT_DIM):
            for date1 in col_list[:-1]:
                avg1 = avg[word_idx]
                value = df[date1][word_idx]
                # modifier grows with the number of digits in the average
                modifier = len(str(avg1)) * 10
                df[date1][word_idx] = (value - avg1) / avg1 * modifier
                # print("Average:", str(avg1))
                # print("Values:", str(value))
                # print("Output:", str(df[date1][word_idx]))
            if word_idx % 1000 == 0:
                print("Word idx: " + str(word_idx))
    df_avg_delta_normalize()
    # rank by average views and keep only the top INPUT_DIM words
    df = df.sort_values("average", ascending=False)
    df = df.head(INPUT_DIM)
    word_df = df.index.copy()
    print("Creating training dataset")
    print("Time\t\t\t\t\t\tProcessing Date\t\tPercentage\t\tCurrent Index")
    pool_list = []
    for date in df.columns.values[:-1]:  # last col (avg) is excluded
        # for date in df.columns.values[0:1]:  # for testing only
        pool_list.append(date)
    with Pool(processes=7) as pool:
        results = pool.starmap(pool_execute, zip(repeat(df), repeat(word_df), pool_list))
    # dataset = []
    for result in results:
        # dataset.append(result)
        print("Writing: dataset-" + str(result[1]) + ".csv")
        with open(WORKING_DIR + "output/dataset-" + str(result[1]) + ".csv", "w",
                  encoding="utf-8", newline="") as out_file:
            writer = csv.writer(out_file)
            writer.writerows(result[0])
    # Create a csv file containing the word index
    print("Creating index file")
    word_df.to_series().to_csv(WORKING_DIR + "output/word_idx6.csv", encoding="utf-8",
                               index=False, header=False)
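
# Each row of dataset-<date>.csv is "center_idx,context_idx"; both values are
# positions in output/word_idx6.csv (one word per line, line number = index).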


def main():
    s_date = datetime.datetime(2020, 6, 1)
    e_date = datetime.datetime(2020, 6, 3)
    # create_csv(s_date, e_date)  # uncomment to (re)build the csv files first
    preprocess()


if __name__ == '__main__':
    prog_start = time.time()
    main()
    print("Total time elapsed: " + str((time.time() - prog_start) / 60) + " minutes")