%% Figure: Number of authors per manuscript
% Loads ../Data/authorperpaper.txt, prints summary statistics of its numeric
% data to the command window and draws the distribution as a bar chart whose
% bar heights are percentages of the total count.

data = importdata('../Data/authorperpaper.txt');

% summary statistics (intentionally unterminated so the values are displayed)
range(data.data)
max(data.data)
median(data.data)
mean(data.data)
std(data.data)
quantile(data.data, [.1 .9])

% histogram counts for the bin centers 1, 2, ..., 15
h = hist(data.data, 1:15);

% plot
clf
set(gcf, 'Pos', [731 316 528 276])
% bar heights are normalized to percent of all counted entries
bar(1:15, 100*h/sum(h), 'edgecolor', 'none', 'facecolor', [0.89 0.45 0.13])
%xlim([categorical(1) categorical(15)])
xlabel('Number co-authors per paper'), ylabel('Frequency')
box off




%% Figure: Time authors work in the field
% Loads ../Data/authorstime.txt (';'-delimited), prints summary statistics,
% fits an exponential a*exp(b*x) to the normalized histogram (bins 6..40)
% and plots fit and histogram on a semi-logarithmic y-axis.

data = importdata('../Data/authorstime.txt', ';');

% summary statistics (intentionally unterminated so the values are displayed)
range(data.data)
max(data.data)
median(data.data)
mean(data.data)
std(data.data)
quantile(data.data, [.25 .75])

% normalized histogram for the bin centers 1, 2, ..., 40
[h, bins] = hist(data.data, 1:40);
h = h / sum(h);

% fit exponential model
expModel = fittype('a * exp(b * x)', 'independent', 'x', 'coefficients', {'a', 'b'});
initialGuesses = [1, -0.1]; % initial guesses for parameter values

% fit the model to the data; the first five bins are excluded from the fit
fitResult = fit(bins(6:end)', h(6:end)', expModel, 'StartPoint', initialGuesses);

% plot
clf
% fitted curve is evaluated on x = 1..30 only
y_values = fitResult.a * exp(fitResult.b * (1:30));

semilogy(1:30, y_values, 'Color', [0.89 0.45 0.13], 'LineWidth', 2)
hold on
set(gcf, 'Pos', [731 316 528 276])
semilogy(bins, h, '.', 'Color', [0. 0.62 0.85], 'MarkerSize', 15)
grid on
xlabel('Years publishing in the field'), ylabel('Fraction of authors')
box off






%% Figure: Publications per authors
% Loads ../Data/authorspublications.txt (';'-delimited), prints column-wise
% summary statistics and compares the normalized histograms of its first two
% data columns in one grouped bar chart.
% NOTE(review): judging by the inline remarks below, column 2 presumably
% holds the number of publications per author and column 1 the number of
% first-author publications -- confirm against the data file.

data = importdata('../Data/authorspublications.txt', ';');

% summary statistics (intentionally unterminated so the values are displayed)
max(data.data)
median(data.data)
mean(data.data)
std(data.data)
quantile(data.data, [.25 .75])

% normalized histogram of column 2 for the bin centers 0..max(column 2)
[h, bins] = hist(data.data(:,2), 0:max(data.data(:,2))); h = h / sum(h);
h(2) % published only once

% column 1 is binned with the same centers (taken from column 2) so that
% both histograms share one x-axis
[h1, bins] = hist(data.data(:,1), 0:max(data.data(:,2))); h1 = h1 / sum(h1);
1-h1(1) % fraction of 1st authors

% bar expects one row per x value and one column per series
bar(bins, [h(:), h1(:)])

%loglog(bins, [h; h1], '.', 'Color', [0. 0.62 0.85], 'MarkerSize', 15)






%% Figure: Number of new authors per year
% Setup: defines the range of years to analyse, loads the bibliography and
% initializes the per-year containers.

% Get the current date and time
currentDateTime = datetime('now');

% analysed years run from minYear up to and including the current year
minYear = 1987;
maxYear = currentDateTime.Year;

% import paper from BibTeX file
bibdata = readBibTeX('../rp.bib');
% all labels (one field of bibdata per bibliography entry)
labels = fieldnames(bibdata);


yearList = minYear:maxYear;
% One cell per year; the empty string "" marks a year for which no authors
% have been stored yet. The cell array is rebuilt from scratch so that
% nothing left over in the workspace from a previous run can survive.
yearAuthors = repmat({""}, 1, length(yearList));
% number of papers counted per year
numPapers = zeros(length(yearList), 1);

% collect authors in each year
% For every bibliography entry that is not annotated as 'software' or
% 'related', count the paper in its publication year and merge its authors
% into that year's list of unique authors (yearAuthors).
for i = 1:length(labels)

    paper = bibdata.(labels{i});
    year = paper.year;
    if strcmpi(year, 'in press')
        year = '2024';   % in-press entries are counted as 2024
    end

    % skip if paper is software or related
    if any(strcmpi(paper.annote, {'software', 'related'}))
        continue
    end

    % str2double does not evaluate its input (str2num does) and returns NaN
    % for text that is not a number
    yearNum = str2double(year);

    % current year as index; idxYear is 0 when the year is not in yearList
    [~, idxYear] = ismember(yearNum, yearList);
    if idxYear == 0
        % papers before minYear are skipped silently; anything else
        % (unparseable year, year after maxYear) is reported
        if ~(yearNum < minYear)
            warning('Skipping entry %s: year "%s" is outside %d-%d.', ...
                labels{i}, year, minYear, maxYear);
        end
        continue
    end

    % list of authors in current paper
    paperAuth = string(strsplit(paper.author, ' and '));

    numPapers(idxYear) = numPapers(idxYear) + 1;

    % get already stored authors
    authList = yearAuthors{idxYear};

    if strlength(authList(1)) == 0
        % nothing stored for this year yet (still the "" placeholder)
        authList = paperAuth;
    else
        % append only the authors that are not stored yet, keeping the
        % order in which they appear in the paper
        authList = [authList, setdiff(paperAuth, authList, 'stable')];
    end

    yearAuthors{idxYear} = authList;

end


% count new authors
% Walk through the years in order; an author counts as "new" in the first
% year in which the name shows up. authorList accumulates every name seen
% so far.
authorList = string.empty;
numNewAuthors = zeros(length(yearList), 1);
numAllAuthors = zeros(length(yearList), 1);
for k = 1:length(yearList)
    authorsThisYear = yearAuthors{k};

    % years that still hold the "" placeholder keep their zero counts
    if ~isempty(authorsThisYear{1})
        isKnown = ismember(authorsThisYear, authorList);
        isNew = ~isKnown;

        authorList = [authorList, authorsThisYear(isNew)];
        numNewAuthors(k) = nnz(isNew);
        numAllAuthors(k) = numel(authorsThisYear);
    end
end


% plot number of new authors per year
% Double-y-axis figure: the first axes (h(1)) shows the number of new and of
% all authors per year, the second axes (h(2)) the ratio new/all.
clf
set(gcf, 'Pos', [731 316 528 276])
% h holds the two axes handles; the ratio is NaN for years in which
% numAllAuthors is zero
h = plotyy(datetime(yearList,1,1), [numNewAuthors, numAllAuthors], datetime(yearList,1,1), numNewAuthors ./ numAllAuthors);
grid on
% NOTE(review): the two lines on h(1) are addressed through the order of
% h(1).Children; presumably Children(1) is the numAllAuthors line (it gets
% the same colour as the numAllAuthors area below) -- confirm
set(h(1).Children(1), 'Color', [0. 0.62 0.85], 'LineWidth', 2)
set(h(1).Children(2), 'Color', [0.89 0.45 0.13], 'LineWidth', 2)
% ratio line: grey and dashed
set(h(2).Children, 'Color', [.5 .5 .5], 'LineWidth', 2, 'LineStyle', '--')
% colour each y-axis to match what it shows
set(h(1), 'YColor', [0. 0.62 0.85])
set(h(2), 'YColor', [.5 .5 .5])
% one vertical tick label per year
set(h(1), 'XTickLabelRotation', 90)
set(h(1), 'XTick', datetime(yearList,1,1))
set(h(1), 'YTick', 0:200:2000)
set(h(2), 'YTick', 0:.2:1)

% filled areas for all authors and for new authors
% NOTE(review): area() draws into the current axes, presumably h(1) -- confirm
hold on
area(datetime(yearList,1,1), numAllAuthors, 'edgecolor', 'none', 'facecolor', [0. 0.62 0.85])
area(datetime(yearList,1,1), numNewAuthors, 'edgecolor', 'none', 'facecolor', [0.89 0.45 0.13])

% x-limits are fixed to 1987-2023 on both axes, independent of yearList
xlim(h(1),[datetime(1987,1,1) datetime(2023,1,1)])
xlim(h(2),[datetime(1987,1,1) datetime(2023,1,1)])
xlabel('Publication year')
ylabel(h(1), 'Number of authors')
ylabel(h(2), 'Fraction of new authors')
box off




%% Figure: Growing community
% Cumulative number of new authors per year index on log-log axes, together
% with a power-law fit (a straight line in log10-log10 space) that uses the
% points from index 6 onwards.

% fit powerlaw
sizeCommunity = cumsum(numNewAuthors);
allIdx = 1:length(sizeCommunity);
fitIdx = 6:length(sizeCommunity);
p = polyfit(log10(fitIdx), log10(sizeCommunity(fitIdx)), 1)

% fitted values transformed back from log10 space
powerLawFit = 10.^polyval(p, log10(fitIdx));

clf
loglog(fitIdx, powerLawFit, 'Color', [0.89 0.45 0.13], 'LineWidth', 2)
hold on

loglog(allIdx, sizeCommunity, '.', 'color', [0. 0.62 0.85], 'MarkerSize', 15, 'LineWidth', 2)
grid on
xlabel('Publication year since 1987-1')
ylabel('Total number of authors since 1987')
set(gca, 'XTick', [1:10 20 30 40], 'Xlim', [1 40], 'XTickLabelRotation', 0)
box off



% exponential growth
% Fits log(sizeCommunity) linearly against the calendar year in three
% consecutive intervals (the slope is the exponential growth rate) and plots
% the three fits together with the data on a semi-logarithmic y-axis.

% fit exponential growth model to the data for 3 intervals
yearsIntervals = {1987:1997;
                  1997:2009;
                  2009:2023};
numIntervals = numel(yearsIntervals);

% Row i holds [slope intercept] of interval i. p is reinitialized here so
% that no coefficients of an earlier fit stored in p are mixed in.
p = zeros(numIntervals, 2);
for i = 1:numIntervals
    yrs = yearsIntervals{i};
    % sizeCommunity(k) belongs to the year 1986 + k
    communitySize = sizeCommunity(yrs - 1986);
    p(i,:) = polyfit(yrs(:), log(communitySize(:)), 1);
end



% plot
clf
set(gcf, 'Pos', [731 316 528 276])

for i = 1:numIntervals
    yrs = yearsIntervals{i};
    % fitted exponential for this interval (output suppressed)
    semilogy(yrs, exp(polyval(p(i,:), yrs)), 'LineWidth', 3, 'Color', [0.89 0.45 0.13]);
    hold on
    % label the fit with its growth rate at the centre of the interval
    midYear = mean(yrs);
    text(midYear, exp(polyval(p(i,:), midYear)), sprintf('~e^{%1.3f t}',p(i,1)))
end

semilogy(1986+(1:length(sizeCommunity)),sizeCommunity, '.', 'Color', [0. 0.62 0.85], 'MarkerSize', 15)

xlabel('Publication year'), ylabel('Total number of authors')
set(gca, 'XTickLabelRotation', 90, 'Xlim', [1987 2023], 'XTick', [1987:2023])

grid on
box off






%% Figure: Co-author network communities
% Reads the node table exported from Gephi, computes the size of every
% modularity class and prints one LaTeX table row per class that holds at
% least 1% of all nodes: class index, up to three labels with the highest
% v_numberworks, member count and share in percent.
data = readtable('../Data/coauthor_network.csv'); % data exported from Gephi

m = data.modularity_class; % community indices
max(m) % largest community index

% number of nodes per community index 0..max(m)
[h, bins] = hist(m, 0:max(m));

% columns: community index, share of all nodes in percent, member count
communities = [bins(:) 100*h(:)/sum(h) h(:)];
sortedCommunities = sortrows(communities, 2, 'descend');

% iterate over the rows (size(...,1), not length, which would return the
% number of columns when there are fewer than three communities)
for i = 1:size(sortedCommunities, 1)
    % rows are sorted by share, so everything after the first community
    % below 1% can be dropped
    if sortedCommunities(i,2) < 1
       break
    end
    communityIdx = sortedCommunities(i,1);
    filteredData = sortrows(data(data.modularity_class == communityIdx, :), 'v_numberworks', 'descend');

    % a community may have fewer than three members
    numShown = min(3, height(filteredData));

    disp(sprintf('%i\t&%s\t&%i\t&%3.1f\\\\', communityIdx, char(join(string(filteredData.Label(1:numShown)),'; ')), sortedCommunities(i,3) , sortedCommunities(i,2) ))
end


