%% (Internal) Cluster data with expectation-maximization algorithm
%
%     [clust_labels, w_Trained_Classifier] = cluster_data_with_EM_clust(dsTrain, w_mapp, CantClusters, iter_times)
%
% The features are scaled by their per-column standard deviation, emclust is
% run iter_times times, and the labels obtained in the different runs are
% combined: every sample gets a "label pattern" (one character per run) and
% patterns that differ in at most round(0.2*iter_times) runs are merged, the
% most populated pattern absorbing the smaller ones.
%
% Arguments:
%
%      + dsTrain: PRdataset with data (one sample per row).
%
%      + w_mapp: mapping to perform clustering.
%
%      + CantClusters: clusters to discover (default 5). When emclust
%        reports that it cannot find that many components, the request is
%        retried with one cluster less, down to a single cluster.
%
%      + iter_times: iterations to perform clustering (default 1).
%
% Output:
%
%      + clust_labels: Cluster membership labels, a column vector with one
%        integer in 1..K per sample, K being the number of distinct label
%        patterns left after merging.
%
%      + w_Trained_Classifier: Mapping returned by the last emclust run. It
%        is NOT retrained on the merged labels.
%
% Example:
%
%     warning off all; prwarning(0);
%     Clust_Labels = cluster_data_with_EM_clust(featMat_clust(pending_hb_idx,:), qdc_new([],1e-6,1e-6, []), CantClusters, iter_times);
%     warning on all; prwarning(1);
%
% See also a2hbc_main
%
% Author: Mariano Llamedo Soria llamedom@electron.frba.utn.edu.ar
% Version: 0.1 beta
% Last update: 14/5/2014
% Birthdate  : 21/4/2015
% Copyright 2008-2015
%
function [clust_labels, w_Trained_Classifier] = cluster_data_with_EM_clust(dsTrain, w_mapp, CantClusters, iter_times)

if nargin < 4 || isempty(iter_times)
    iter_times = 1;
end

if nargin < 3 || isempty(CantClusters)
    CantClusters = 5;
end

% Defined up-front so both outputs exist even if the loop below never runs.
clust_labels = [];
w_Trained_Classifier = [];

m = getsize(dsTrain, 1);

% Scale every feature by its standard deviation.
% NOTE(review): "+dsTrain" is assumed to be the PRTools operator that returns
% the raw data matrix of the dataset.
% The dimension is given explicitly so a single-row dataset is not reduced
% along its columns, and constant features (std == 0) are left unscaled
% instead of being turned into NaN/Inf by a division by zero.
std_train = std(+dsTrain, 0, 1);
std_train(std_train == 0) = 1;
dsTrain = setdata(dsTrain, bsxfun(@rdivide, +dsTrain, std_train));

% One character per sample (row) and per clustering run (column).
clustered_labels = repmat('0', m, iter_times);

for jj = 1:iter_times

    % EM clustering; each time emclust cannot find the requested number of
    % components, ask for one cluster less.
    clust_labels = [];
    ii = 0;
    bContinuar = true;
    while bContinuar && ii < CantClusters
        try
            clust_labels = [];
            [clust_labels, w_Trained_Classifier] = emclust(dsTrain, w_mapp, CantClusters - ii);
            bContinuar = false;
        catch ME
            if strcmpi(ME.message, 'Not possible to find desired number of components')
                ii = ii + 1;
            else
                rethrow(ME)
            end
        end
    end

    if isempty(clust_labels)
        error('Clustering failed, check data.')
    end

    % Numeric label k is stored as the character char(97 + k).
    clustered_labels(:, jj) = char(97 + clust_labels);

end

% Maximum number of runs in which two label patterns may disagree and still
% be considered the same cluster.
max_distance = round(0.2 * iter_times);

bClustered = true;
while bClustered

    % Reset on every pass, so the loop also ends when there is nothing to scan.
    bClustered = false;

    % Analyse the agreement of the labelling across the different runs:
    % unique label patterns, and the pattern each sample belongs to.
    [all_clusters, ~, row2cluster] = unique(clustered_labels, 'rows');
    row2cluster = row2cluster(:);
    cant_clusters = size(all_clusters, 1);

    if cant_clusters < 2
        % Nothing left to merge.
        break
    end

    cluster_sizes = accumarray(row2cluster, 1, [cant_clusters 1]);
    [~, clust_sort_idx] = sort(cluster_sizes, 'descend');

    % distances(i,j): number of runs in which patterns i and j disagree.
    distances = zeros(cant_clusters);
    for kk = 1:cant_clusters
        distances(:, kk) = sum(bsxfun(@ne, all_clusters, all_clusters(kk, :)), 2);
    end

    % Starting from the most populated pattern, merge into it every other
    % pattern found at a distance of at most max_distance.
    for ii = clust_sort_idx(:).'
        cluster2fusion_idx = find(distances(:, ii) <= max_distance);
        cluster2fusion_idx(cluster2fusion_idx == ii) = [];

        if ~isempty(cluster2fusion_idx)
            absorbed = ismember(row2cluster, cluster2fusion_idx);
            clustered_labels(absorbed, :) = repmat(all_clusters(ii, :), sum(absorbed), 1);
            bClustered = true;
            % Force the distances to be recomputed after every merge, so
            % that not everything collapses together and only the big
            % clusters absorb the smaller ones.
            break
        end
    end

end

% NOTE: a second, hierarchical-clustering-like stage that kept merging the
% remaining patterns at distances max_distance+1 .. iter_times used to follow
% here; it was disabled by the author and is not performed.
%
% NOTE: filtering out small clusters (e.g. fewer than 50 samples) into a
% single catch-all group was also tried and disabled: it does not seem a good
% idea, because it groups together the classes that appear only occasionally,
% making them undetectable.

% Re-analyse the agreement across runs after merging: the final label of a
% sample is the index of its pattern among the sorted unique patterns.
[~, ~, clust_labels] = unique(clustered_labels, 'rows');
clust_labels = clust_labels(:);