import pandas as pd import datetime as dt import numpy as np #importing square data sales_2019 = pd.read_csv('data/items-2019-01-01-2020-01-01.csv') sales_2020 = pd.read_csv('data/items-2020-01-01-2021-01-01.csv') sales_2021 = pd.read_csv('data/items-2021-01-01-2022-01-01.csv') sales_2022 = pd.read_csv('data/items-2022-01-01-2023-01-01.csv') sales_2023 = pd.read_csv('data/items-2023-01-01-2024-01-01.csv') df_list = [sales_2019, sales_2020, sales_2021, sales_2022, sales_2023]Sales data for the years in question is imported, converted into dataframes and the resulting dataframes are put into a list.
monetary_columns = ['Gross Sales', 'Discounts', 'Net Sales', 'Tax'] for df in df_list: for col in monetary_columns: #fixing monetary values df[col] = df[col].str.replace(',', '', regex=True) df[col] = df[col].str.replace('$', '', regex=True) df[col] = df[col].astype('float') #changing applicable columns to datetime df['Date'] = pd.to_datetime(df['Date']) # df['Time'] = pd.to_datetime(df['Time'], infer_datetime_format=True) #adding new columns based off datetime df['year'] = df['Date'].dt.year df['month'] = df['Date'].dt.month df['month_name'] = df['Date'].dt.month_name() df['day'] = df['Date'].dt.day df['Day_of_Week'] = df['Date'].dt.day_name() df["period"] = df["year"].astype(str) + "." + df["month_name"].astype(str) df['Time'] = df['Time'].apply(lambda x: dt.datetime.strptime(x, "%H:%M:%S")) df['hour'] = df['Time'].dt.hour #addition of new columns df['Customer Name Init'] = df['Customer Name'].str.replace('[^A-Z]', '', regex=True) #changing customer names to initials for privacy df['Customer Name'] = df['Customer Name'].str.replace(',', '', regex=True) df['unit_price'] = df['Gross Sales'] / df['Count'] # adding a price per unit of product columnCustomers without a name are understood to be cash paying customers and cannot be tracked.
# Fill in missing values df['customer_name'].fillna('NoName',inplace=True) df['customer_init'].fillna('NN',inplace=True)
#dictionary import to map items to product types and beer names import ast def read_text_file(file_path): with open(file_path, "r") as file: content = file.read() return content file_path_ names = "data/names_dict.txt" file_content_ names = read_text_file(file_path_ names) names _dict = ast.literal_eval(file_content_ names) file_path_product = "data/product_dict.txt" file_content_product = read_text_file(file_path_product) product_dict = ast.literal_eval(file_content_product)These dictionaries could then be mapped to create new columns for the beer names and the product type.
df['beer_ names’] = df['Item'].map(names_dict) df['product_type'] = df['Item'].map(product_dict)
df = pd.concat([sales_2019, sales_2020, sales_2021, sales_2022, sales_2023], join='inner', axis='rows') df = df.drop(['Payment ID', 'Customer ID', 'Customer Reference ID', 'Price Point Name', 'Discounts', 'Net Sales', 'Tax', 'Category', 'Commission', 'Employee', 'Unit', 'Dining Option', 'Details', 'Qty', 'SKU', 'Modifiers Applied','Location', 'Itemization Type', 'Fulfillment Note', 'Notes', 'Device Name', 'Event Type', 'Time Zone'], axis='columns') df = df.rename(columns={"Date":"date", "Time":"time", "Item":"item", "Gross Sales":"gross_sales", "Transaction ID":"transaction_id", "Customer Name":"customer_name", "Customer Name Init":"customer_init", "Day_of_Week":"weekday", "Count":"count"}) df = df.sort_values(by="date", ascending=True, ignore_index=True)
# addition of customer ID number df['customer_id_no'] = pd.factorize(df['customer_name'])[0] df['transaction_no'] = pd.factorize(df['transaction_id'])[0] df['period_id'] = pd.factorize(df['period'])[0]
df = df.sort_values(['customer_id_no', 'date'])
df['visit_freq'] = df.groupby('customer_id_no')['transaction_no'].transform('nunique') df = df.sort_values(['date', 'customer_id_no'])
df['daily_no_customers'] = df.groupby('date')['transaction_no'].transform('nunique')
df = df.sort_values(['date', 'beer_ names']) df['no_styles_sold'] = df.groupby('date')['beer_ names'].transform('nunique')
sales_data = pd.merge(df, weather, on='date', how='left') sales_data['precipitation'].fillna(0,inplace=True) sales_data['snowfall'].fillna(0,inplace=True)
sales_data = pd.merge(sales_data, untappd_data, on='beer_ names’, how='left')
sales_data = pd.merge(sales_data, insta_data, on='date', how='left')After the final merging, the data is ready to be used for further analysis!