import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
sns.set()
Author: Iza Romanowska
Date: 13/10/2020
Associated data: Dataset_funerary: Raja, Rubina. 2021. “Funerary data from Palmyra, Syria collated by the Palmyra Portrait Project”, version 1.0. Zenodo. DOI: 10.5281/zenodo.4669962
Associated publications: Romanowska, I. et al. in press “Reconstructing the social, economic and demographic trends of Palmyra’s elite from funerary data.” Journal of Archaeological Science: xxx.
Raja, R. et al. in press “300 years of Palmyrene history through deaths. Unlocking the potential of archaeological and historical data for studying social transformation.”
Bobou, O. et al. 2021 “Historical Trajectories of Palmyra’s elites through the lens of archaeological data.” Journal of Urban Archaeology 4: xxx.
Description: Data analysis script for three data types: portraits, tombs, and burials from the city of Palmyra, Syria. Includes EDA (exploratory data analysis), cumulative probability curves, and simple regression.
data = pd.read_excel('SI_A_data/dataset_funerary.xlsx')
data.head()
Run this every time to check data coherence and flag potential issues.
data.info()
There can be no missing values in the 'Object ID' and 'Type?' columns.
# .count() returns the number of non-null values, so this catches missing entries
assert data['Object ID'].count() == len(data)
assert data['Type?'].count() == len(data)
print('Object ID duplicates:\n', data.duplicated(subset = 'Object ID').value_counts())
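If the duplicate check ever reports True values, a quick way to inspect the offending rows (a sketch):
# keep=False marks every copy of a duplicated Object ID, not just the later ones
dupes = data[data.duplicated(subset='Object ID', keep=False)]
print(dupes.sort_values('Object ID'))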
First, ensure all dates are in the correct format. A few dates were entered as, e.g., 88/89; in all such cases the difference is only one year, so only the first value was kept.
data_test = pd.to_numeric(data.date_start,errors = 'coerce')
#print(data_test)
incorrect_date_start = data.date_start[pd.isnull(data_test)]
#print(incorrect_date_start)
correct_date_start = incorrect_date_start.apply(lambda x: int(x.split('/')[0]))
#print(correct_date_start)
data.loc[correct_date_start.index, "date_start"] = correct_date_start
#print(data.loc[correct_date_start.index])
#data.loc[960:967] # double check
data_test2 = pd.to_numeric(data.date_end,errors = 'coerce')
incorrect_date_end = data.date_end[pd.isnull(data_test2)]
#print(incorrect_date_end)
correct_date_end = incorrect_date_end.apply(lambda x: int(x.split('/')[0]))
#print(correct_date_end)
data.loc[correct_date_end.index, "date_end"] = correct_date_end
#print(data.loc[correct_date_end.index])
#print(data.loc[488]) # double check
#print(data.loc[2507]) # double check
data['date_start'] = pd.to_numeric(data['date_start'], errors = 'raise')
data['date_end'] = pd.to_numeric(data['date_end'], errors = 'raise')
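The same cleanup could be written more compactly; a sketch of an equivalent one-pass version, assuming 'YYYY/YYYY' is the only non-numeric pattern in these columns:
# keep the part before any '/' and convert back to numbers; NaNs survive the round trip
for col in ['date_start', 'date_end']:
    data[col] = pd.to_numeric(data[col].astype(str).str.split('/').str[0])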
Second, display the range.
print('date start',data.date_start.min(), data.date_start.max() )
print('date end', data.date_end.min(), data.date_end.max())
#non-numeric elements ('unknown') are replaced with np.nan
#data.g_size.unique()
data.g_size = data.g_size.replace('unknown', np.nan)
data.g_size = data.g_size.replace('unknown ', np.nan)
data.g_size = pd.to_numeric(data.g_size)
#data.g_size.unique()
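An equivalent that also catches other stray whitespace; a sketch assuming 'unknown' is the only non-numeric token, which makes errors='coerce' safe here:
# strip whitespace so 'unknown ' and 'unknown' collapse, then coerce non-numbers to NaN
data.g_size = pd.to_numeric(data.g_size.astype(str).str.strip(), errors='coerce')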
There are 117 portraits whose start and end dates are identical. This is an error: since we count per year, the end date should be the following year (e.g., a portrait placed in year 211 should have start and end dates of 211 and 212); we treat each date as 1 January of that year. The majority of these dates come from 5 cases where the year was known thanks to an inscription on, e.g., a group portrait. We will therefore plot and check the dataset including these cases; if the pattern is qualitatively the same (the trends are identical), we will remove them to avoid confusing spikes in the visualisation.
#data.loc[data['date_end'] == data['date_start']]['date_start'].value_counts()
print('capacity_min range:', data.capacity_min.min(), data.capacity_min.max())
print('capacity_max range:', data.capacity_max.min(), data.capacity_max.max())
UPDATE: For tombs known only from written sources, the capacity was set to 1.
data.loc[data['capacity_min'] == data['capacity_max']]
len(data.loc[data['capacity_min'] == data['capacity_max']])
For particularly well-preserved tombs, and for those known only from written sources, the exact capacity is known.
assert data['Type?'].isin(['grave', 'portrait', 'bones']).any()
assert data['tomb_type'].isin(['tower', 'hypogeum', 'house']).any()
assert data['gender'].isin(['male', 'female', 'unspecified']).any()
assert data['age'].isin(['adult', 'child', 'old_adult', 'unspecified', 'young_adult']).any()
# If any of the asserts fails, use this code to identify the incorrect value
categorical = ['Type?', 'tomb_type', 'gender', 'age', 'b_age']
for i in categorical:
    print(i + '\n', data[i].unique())
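A stricter variant of these checks (a sketch; it asserts that every non-null value is in the expected set, rather than that at least one is):
# hypothetical stricter check, shown for the 'Type?' column only
allowed = {'grave', 'portrait', 'bones'}
bad = data.loc[~data['Type?'].isin(allowed) & data['Type?'].notna(), 'Type?']
assert bad.empty, 'unexpected Type? values: ' + str(bad.unique())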
sns.set( font_scale=1.5)
sns.catplot(data = data, x = 'Type?', kind = 'count',height=8.27, aspect = 15/8)
plt.title('Types of data')
data["Type?"].value_counts()
Calculate the mean date and add it as an additional column.
data['date_mean'] = data['date_start']+((data['date_end']-data['date_start']) / 2)
print('Number of dates: ', len(data.date_start.dropna()))
fig = plt.figure(figsize=(15,7))
sns.distplot(data['date_start'])
sns.distplot(data['date_end'])
#sns.distplot(data['date_mean'].dropna().astype(int))
plt.figure( figsize = (20,10))
_, bins = np.histogram(data["date_start"])
for i in data['Type?'].unique():
    data_plot = data[data['Type?'] == i]
    sns.distplot(data_plot["date_start"], bins=bins, label=i)
plt.legend()
plt.title("Distribution of start dates")
plt.show()
plt.close()
plt.figure( figsize = (20,10))
_, bins = np.histogram(data["date_end"])
for i in data['Type?'].unique():
    data_plot = data[data['Type?'] == i]
    sns.distplot(data_plot["date_end"], bins=bins, label=i)
plt.legend()
plt.title("Distribution of end dates")
def calc_prob(data, multiplier=1):
    # work on a copy so the caller's (possibly sliced) frame is not mutated
    data = data.copy()
    # get the oldest and youngest dates to set the range for the dictionary;
    # cast to int because the columns are floats whenever they contain NaNs
    minimum = int(data["date_start"].min())
    maximum = int(data["date_end"].max())
    # initiate the dictionary
    x = dict.fromkeys(range(minimum, maximum), 0)
    # calculate the 'probability of existence' for any one year = 1 / range of dates
    data['range'] = (1 / (data['date_end'] - data['date_start'])) * multiplier  # the multiplier is for, e.g., tomb capacity
    # drop nans because they are problematic when recasting into integers
    data = data.dropna(subset=['date_start'])
    data = data.dropna(subset=['date_end'])
    # for each object
    for row in range(len(data)):
        # for each year in the object's date range
        for year in range(data['date_start'].astype(int).iloc[row], data['date_end'].astype(int).iloc[row]):
            # update that year with the probability of that object
            x[year] += data['range'].iloc[row]
    # recast it into a useful data structure
    s = pd.Series(x, name='Probability')
    s.index.name = 'Year'
    return s
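To make the weighting concrete, here is a minimal toy example (made-up dates, not from the dataset): an object spanning ten years contributes 1/10 of a 'probability of existence' to each year of its range, and overlapping objects stack.
toy = pd.DataFrame({'date_start': [100, 105], 'date_end': [110, 115]})
toy_probs = calc_prob(toy)
print(toy_probs.loc[100])  # 0.1, only the first object covers year 100
print(toy_probs.loc[107])  # 0.2, both objects cover year 107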
Note: this curve is used in Figure 8 in Romanowska et al. 2021 and Figure 4 in Bobou et al. 2021.
probs = calc_prob(data)
plt.figure(figsize = (15,7))
g = sns.lineplot(data = probs, color ='#3182bd')
g.set_title('Probability distribution of all objects')
plt.xlim(0, 300)
This is Figure 4 in Romanowska et al. 2021. Passing capacity_mean as the multiplier weights each tomb's per-year probability by its estimated capacity.
data['capacity_mean'] = data['capacity_min']+((data['capacity_max']-data['capacity_min'])/2)
probs_tombs = calc_prob(data[data['Type?']== 'grave'], data["capacity_mean"])
fig, ax = plt.subplots( figsize = (15, 8))
g = sns.lineplot(data = probs_tombs, color ='#ffa726', label = 'Tombs capacity')
plt.xlim(0, 300)
plt.xlabel('Year (CE)')
plt.tight_layout()
plt.savefig('figures/Rom_tombs_capacity_small.png', dpi = 600)
This is Figure 3 in Romanowska et al. 2021
probs_bones = calc_prob(data[data['Type?']== 'bones'])
fig, ax = plt.subplots( figsize = (15, 8))
g = sns.lineplot(data = probs_bones, color ='#3182bd', label = 'Individual graves')
plt.xlim(0, 300)
plt.xlabel('Year (CE)')
plt.tight_layout()
plt.savefig('figures/Rom_individual_graves_small.png', dpi = 600)
This is Figure 2 in Romanowska et al. 2021
probs_portr = calc_prob(data[data['Type?']== 'portrait'])
plt.figure(figsize = (15,8))
g = sns.lineplot(data = probs_portr, color ='#8bc34a', label = 'Portraits')
plt.xlim(0, 300)
plt.xlabel('Year (CE)')
plt.tight_layout()
plt.savefig('figures/Rom_prob_portraits_small.png', dpi = 600)
This is Figure 1 in Romanowska et al. 2021
probs_bones = calc_prob(data[data['Type?']== 'bones'])
probs_tombs = calc_prob(data[data['Type?']== 'grave'] )
probs_portr = calc_prob(data[data['Type?']== 'portrait'])
plt.figure(figsize = (15,8))
sns.set(font_scale=1.5)
g = sns.lineplot(data = probs_bones, color ='#3182bd', label = 'Burials')
g = sns.lineplot(data = probs_tombs, color ='#ffa726', label = 'Tombs')
g = sns.lineplot(data = probs_portr, color ='#8bc34a', label = 'Portraits')
plt.xlim(0, 300)
plt.xlabel('Year (CE)')
plt.ylabel('Cumulative probability')
plt.tight_layout()
plt.savefig('figures/Rom_prob_all_small.png', dpi = 600)
This is Figure 7 in Raja et al. 2021
sns.set(style="whitegrid", font_scale=2.2)
probs_bones = calc_prob(data[data['Type?']== 'bones'])
probs_tombs = calc_prob(data[data['Type?']== 'grave'] )
probs_portr = calc_prob(data[data['Type?']== 'portrait'])
plt.figure(figsize = (30,14))
g = sns.lineplot(data = probs_bones, color ='#1979a9',linewidth=4, label = 'Bones from graves')
g = sns.lineplot(data = probs_tombs, color ='#44bcd8',linewidth=4, label = 'Tombs')
g = sns.lineplot(data = probs_portr, color ='#e07b39',linewidth=4, label = 'Portraits')
g.set_title('Probability distribution of dates for three types of funerary objects')
plt.xlim(0, 272)
plt.xlabel('Year')
plt.ylabel('Cumulative probability')
plt.tight_layout()
plt.savefig('figures/Raja_prob_all_300.png', dpi = 300)
As mentioned above, the cumulative probability algorithm cannot deal with chronologies that are not intervals, i.e., exact dates. This affects 117 objects; however, the majority of them come from 5 dates. Here we corrected the interval and compared the results.
def calc_prob_1y(data, multiplier=1):
    # identical to calc_prob above, but uses the corrected 'date_end_new' column
    data = data.copy()
    # oldest and youngest dates; cast to int because the columns may be floats
    minimum = int(data["date_start"].min())
    maximum = int(data["date_end_new"].max())
    # initiate the dictionary
    x = dict.fromkeys(range(minimum, maximum), 0)
    data['range'] = (1 / (data['date_end_new'] - data['date_start'])) * multiplier
    # drop nans because they are problematic when recasting into integers
    data = data.dropna(subset=['date_start'])
    data = data.dropna(subset=['date_end_new'])
    # for each object
    for row in range(len(data)):
        # for each year in the object's date range
        for year in range(data['date_start'].astype(int).iloc[row], data['date_end_new'].astype(int).iloc[row]):
            # update that year with the probability of that object
            x[year] += data['range'].iloc[row]
    # recast it into a useful data structure
    s = pd.Series(x, name='Probability')
    s.index.name = 'Year'
    return s
data["date_end_new"] = data.apply(lambda x : x['date_end'] + 1 if x['date_start'] == x['date_end'] else x['date_end'], axis=1).astype(int)
probs_portr1 = calc_prob_1y(data[data['Type?']== 'portrait'])
probs_portr2 = calc_prob(data[data['Type?']== 'portrait'])
plt.figure(figsize = (15,8))
sns.set()
g = sns.lineplot(data = probs_portr1, color ='#ffa726', label = 'Portraits_all')
g = sns.lineplot(data = probs_portr2, color ='#8bc34a', label = 'Portraits')
plt.xlim(-50, 300)
plt.xlabel('Year')
plt.tight_layout()
There is no significant difference in trends. The 5 spikes from clusters of portraits dated with an inscription are clearly visible. 4 out of 5 fall within 'the peaks' and one in a 'slow growth' phase, so the general conclusions are strengthened. The visualisation without the spikes is easier to interpret, but in the future a solution should be found for exactly dated objects (perhaps binning them within one decade, as sketched below).
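A minimal sketch of that decade-binning idea (the decade_start and decade_end columns are hypothetical and not used elsewhere in this script):
# snap exactly dated objects to their decade, so a portrait dated to 211
# would contribute to the interval 210-220 instead of 211-212
exact = data['date_start'] == data['date_end']
data.loc[exact, 'decade_start'] = (data.loc[exact, 'date_start'] // 10) * 10
data.loc[exact, 'decade_end'] = data.loc[exact, 'decade_start'] + 10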
Each date is associated with a certainty value. The exact criteria are described in the metadata.
plt.figure(figsize=(15, 8))
sns.set_palette(sns.cubehelix_palette(start=.5, rot=-.75, reverse = True))
g = sns.countplot(data['dateS_certainty'])
g.set_title('Certainty - Start Dates')
plt.show()
plt.close()
plt.figure(figsize=(15, 8))
g = sns.countplot(data['dateE_certainty'])
g.set_title('Certainty - End Dates')
plt.show()
ax = (data.groupby(['Type?', 'dateS_certainty'])
          .size()
          .unstack()                     # create the data frame
          .T                             # transpose
          .apply(lambda x: x / x.sum())  # calculate frequencies per evidence type
          .T
          .plot(kind='bar',
                stacked=True,
                figsize=(15, 8)))
ax.set_yticklabels([str(x) +'%' for x in range(0,101,20)])
ax = (data.groupby(['Type?', 'dateE_certainty'])
          .size()
          .unstack()                     # create the data frame
          .T                             # transpose
          .apply(lambda x: x / x.sum())  # calculate frequencies per evidence type
          .T
          .plot(kind='bar',
                stacked=True,
                figsize=(15, 8)))
ax.set_yticklabels([str(x) +'%' for x in range(0,101,20)])
We will replot the cumulative probability curves but scale the dates by their certainty. That is, the most certain dates will be given four times more weight than the most uncertain ones.
probs = calc_prob(data)
probsS = calc_prob(data, data['dateS_certainty'])
probsE = calc_prob(data, data['dateE_certainty'])
plt.figure(figsize = (15,7))
g = sns.lineplot(data = probsS, label = "Start date uncertainty")
g = sns.lineplot(data = probsE, label = "End date uncertainty")
g = sns.lineplot(data = probs, color ='#3182bd', label = "without chronological uncertainty")
g.set_title('Probability distribution of all objects scaled by uncertainty')
plt.xlim(0, 300)
The vast majority of the chronological certainty values are 2, so overall the scaling makes no difference to the trends detected. The scaled curves are higher because we multiplied the original probability by the certainty value.
sns.set()
g = sns.catplot(data = data, x = 'tomb_type', kind = 'count',height=8.27, aspect = 15/8)
g.fig.suptitle('Tomb types', y = 1.03)
fig, ax = plt.subplots(figsize = (15, 8))
data['capacity_mean'] = data['capacity_min']+((data['capacity_max']-data['capacity_min'])/2)
sns.regplot(data['date_start'], data['capacity_mean'], marker = '_', order = 2)
data_cc = data.rename({'capacity_certainty': ' '}, axis = "columns") # this is the quickest way to get rid of the legend title
sns.scatterplot('date_start', 'capacity_mean', hue =' ', data = data_cc, palette= sns.color_palette("Blues", 6))
plt.xlabel('Years (CE)')
plt.ylabel('Mean capacity (number of individual burials)')
plt.xlim(-50, 300)
plt.ylim(-10, None)
plt.legend()
plt.tight_layout()
#plt.savefig('figures/capacity over time.png', dpi = 300)
It looks like capacity grows slightly over time, but after 150 CE the picture becomes much more uncertain. Note that we fit a second-order polynomial here to give the trend a chance to change direction. Let's look at the regression values for the first and second orders.
data_reg = data.dropna(subset=['date_start','capacity_mean'])
np.polyfit(data_reg['date_start'], data_reg['capacity_mean'], 1, full = True)
data_reg = data.dropna(subset=['date_start','capacity_mean'])
np.polyfit(data_reg['date_start'], data_reg['capacity_mean'], 2, full = True)
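To compare the two fits at a glance, a small helper (a sketch, not part of the original analysis) prints the residual sum of squares that full=True returns:
# np.polyfit(..., full=True) returns (coefficients, residuals, rank,
# singular_values, rcond); a lower residual sum means a better in-sample fit
for order in (1, 2):
    coeffs, residuals, *_ = np.polyfit(data_reg['date_start'],
                                       data_reg['capacity_mean'], order, full=True)
    print('order', order, '| coefficients:', coeffs, '| residual SS:', residuals)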
Tombs where no archaeological remains were found were given capacity == 1. They may be dragging the data down, so we replace capacity == 1 with the average capacity of the full series. The general pattern of the average capacity remaining stable holds. This is Figure 5 (right) in Romanowska et al. 2021.
data["capacity_mean_av"] = data["capacity_mean"].replace(1, np.mean(data["capacity_mean"]))
fig, ax = plt.subplots(figsize = (15, 10))
#We plot two reglines for low and moderate/high capacity certainty
data_cert_low = data[data["capacity_certainty"]<=2]
sns.regplot(data_cert_low['date_start'], data_cert_low['capacity_mean_av'], label = 'Low certainty', ax=ax, marker = '_', order = 2)
data_cert = data[data["capacity_certainty"]>2]
sns.regplot(data_cert['date_start'], data_cert['capacity_mean_av'], label = 'High certainty', color = 'navy',ax=ax, marker = '_', order = 2)
data_cc = data.rename({'capacity_certainty': ' '}, axis = "columns") # this is the quickest way to get rid of the legend title
# the scatter has all data points
sns.scatterplot('date_start', 'capacity_mean_av', hue =' ', data = data_cc, ax=ax,palette= sns.color_palette("Blues", 6))
plt.xlabel('Time (years)')
plt.ylabel('Mean capacity (number of individual burials)')
plt.xlim(0, 260)
plt.ylim(-10, None)
plt.legend()
# Insert
left, bottom, width, height = [0.11, 0.618, 0.35, 0.35]
ax2 = fig.add_axes([left, bottom, width, height])
resampled_tomb_data = data[data["Type?"]== "grave"][["date_start", "capacity_mean"]]
# pandas datetimes can only represent years from 1677 to 2262, so we shift everything 1800 years later
resampled_tomb_data.date_start = resampled_tomb_data.date_start + 1800
resampled_tomb_data.date_start = pd.to_datetime(resampled_tomb_data.date_start, format='%Y', errors = "coerce")
resampled_tomb_data =resampled_tomb_data.set_index("date_start")
resampled_tomb_data.resample("20A").mean().plot(ax=ax2)
ax2.set_xticklabels(["-80", "-30", "20", "70","120", "170", "220"]) # and then we put the correct labels back
ax2.set_xlabel("")
plt.tight_layout()
plt.savefig('figures/Rom_capacity over time_small.png', dpi = 600)
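An alternative to the +1800 datetime workaround (a sketch, not used for the figure above) is to bin the years directly with pd.cut:
# 20-year bins over an assumed date range; no datetime conversion needed
tomb_data = data[data['Type?'] == 'grave'][['date_start', 'capacity_mean']].dropna()
bins_20y = np.arange(-100, 281, 20)
tomb_data['period'] = pd.cut(tomb_data['date_start'], bins=bins_20y)
print(tomb_data.groupby('period')['capacity_mean'].mean())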
The same comparison, in numbers:
print('mean capacity of poorly preserved tombs: ', data_cert_low['capacity_mean'].mean())
print('mean capacity of better preserved tombs: ', data_cert['capacity_mean'].mean())
data.groupby('capacity_certainty')['capacity_mean'].mean()
Hypogea were in general larger, but each type has outliers with much higher capacities. This is very likely a function of preservation.
#print(data[data['Type?']== 'grave'].head())
g = sns.catplot(data = data,
x = 'tomb_type',
y = 'capacity_mean',
kind = 'box',
height=8.27, aspect = 15/8)
g.fig.suptitle('Mean tomb capacity for different types', y = 1.03)
plt.figure(figsize = (15, 8))
g = sns.distplot(data['g_size'].dropna())
plt.xlim (0,None)
g.set_title('Distribution of tomb sizes')
This is Figure 5 (left) in Romanowska et al. 2021
plt.figure(figsize=(15,10))
sns.regplot('date_start', 'g_size', data = data, color = 'salmon')
sns.scatterplot('date_start', 'g_size', hue = 'capacity_certainty', data = data)
plt.xlabel('Year (CE)')
plt.ylabel('Tomb size (m$^2$)')
plt.xlim(0, 270)
plt.ylim(-10, None)
plt.legend()
plt.tight_layout()
plt.savefig('figures/Rom_size_over_time_small.png', dpi = 600)
#print(data[data.g_size.notnull()])
g = sns.catplot(data = data,
x = 'tomb_type',
y = 'g_size',
kind = 'box', height=8.27, aspect = 15/8)
g.fig.suptitle('Tomb size for different types', y = 1.03)
data_tombs = data[data["Type?"]=='grave']
data_tombs= data_tombs[data_tombs['tomb_type']!= 'unspecified']
This is Figure 6 in Romanowska et al. 2021
pal = (sns.cubehelix_palette(3))
fig, ax = plt.subplots(2, 1, figsize=(15, 10), sharex = True, gridspec_kw={'height_ratios': [3, 1], 'hspace':0})
counter = 0
for i in data_tombs['tomb_type'].unique():
    data_plot = data_tombs[data_tombs['tomb_type'] == i]
    # bins is reused from the date_end histogram above (ignored when hist=False)
    sns.distplot(data_plot["date_start"], bins=bins, label=i, hist=False, ax=ax[0], color=pal[counter])
    sns.stripplot(data_plot["date_start"], color=pal[counter], ax=ax[1], jitter=True)
    counter += 1
ax[0].set_xlabel('')
ax[0].set_ylabel('PDF')
ax[1].set_xlabel('Year (CE)')
ax[1].spines["top"].set_edgecolor('grey')
plt.xlim(-100, 300)
plt.tight_layout()
plt.savefig('figures/Rom_PDF_tomb_types_small.png', dpi = 600)
This includes both portraits and burials.
sns.set_palette(sns.cubehelix_palette(start=.5, rot=-.75, reverse = True))
sns.set(font_scale = 1.5)
plt.figure(figsize=(15,8))
sns.catplot(data = data, x = 'gender', kind = 'count', height=8.27, aspect = 15/8)
data.gender.value_counts()
age_order = ['child', 'young_adult','adult', 'old_adult'] # ,'unspecified'] # add this to include 'unspecified'
g = sns.catplot(data = data,
x = 'age',
kind = 'count',
order = age_order,
height=8.27, aspect = 15/8)
data.age.value_counts()
plot_data = data.loc[(data['age'] != 'unspecified')
& (data['gender'] != 'unspecified')]
sns.catplot(data = plot_data,
x = 'age',
kind = 'count',
hue = 'gender',
order = age_order, height=8.27, aspect = 15/8)
probs_m = calc_prob(data[data['gender']== 'male'])
probs_f = calc_prob(data[data['gender']== 'female'])
plt.figure(figsize = (14,6))
sns.set()
g = sns.lineplot(data = probs_m, color ='#3182bd', label = 'male')
g = sns.lineplot(data = probs_f, color ='#ffa726', label = 'female')
plt.xlim(0,None)
This is Figure 7 in Raja et al. 2021
probs_m = calc_prob(data[data['gender']== 'male'])
probs_f = calc_prob(data[data['gender']== 'female'] )
probs_ch = calc_prob(data[data['age']== 'child']) # NB: the data contains a typo, 'child ' with a trailing white space, which this filter misses
sns.set(style="whitegrid", font_scale=2.2)
plt.figure(figsize = (30,14))
g = sns.lineplot(data = probs_m, color ='#3182bd',linewidth = 4, label = 'Male')
g = sns.lineplot(data = probs_f, color ='#BF3D31',linewidth = 4, label = 'Female')
g = sns.lineplot(data = probs_ch, color ='#BF9B31',linewidth = 4, label = 'Child')
g.set_title('Probability distribution of portraits depicting men, women and children')
plt.xlim(0, 275)
plt.xlabel('Year')
plt.tight_layout()
plt.legend(loc=2)
plt.savefig('figures/Raja_prob_genders300.png', dpi = 300)
plt.figure(figsize = (14,6))
list_probs = []
sns.set()
for i in age_order:
    probs = calc_prob(data[data['age'] == i])
    list_probs.append(probs)
sns.lineplot(data = list_probs[0], label = age_order[0])
sns.lineplot(data = list_probs[1], label = age_order[1])
sns.lineplot(data = list_probs[2], label = age_order[2])
sns.lineplot(data = list_probs[3], label = age_order[3])
plt.xlim(0,None)
#sns.lineplot(data = list_probs[4], label = age_order[4])
#g = sns.lineplot(data = probs_f, color ='#ffa726', label = 'female')