function [insertMask, returnedKeys] = upsert(conn,tableName,fieldNames,keyFields,data, varargin) % UPSERT inserts new and updates old data to a database table % % UPSERT(CONNECT,TABLENAME,FIELDNAMES,KEYFIELDS,DATA). % CONNECT is a database connection object. TABLENAME is the database % table. FIELDNAMES is a string array of database column names. KEYFIELDS % is the list of primary key fields that must be matched to perform an % UPDATE rather than an INSERT. It may be given as a logical (or 0s, 1s) % array the same length as FIELDNAMES, or a string or cell array of % strings of key column names (in which case KEYFIELDS must be a subset % of FIELDNAMES). DATA is a MATLAB cell array. % % INSERTEDMASK = UPSERT(...) returns a logical vector with one element for % each row of DATA, indicating whether the "upsert" operation meant that % corresponding row of DATA was inserted (TRUE) or merely updated (FALSE). % % UPSERT(...,'dateFields',DATEFIELDS) allows a DATE type field to be used % as one of the primary key fields. DATEFIELDS is specified equivalently to % KEYFIELDS. Each primary key DATE type field's data MUST be given as an % ANSI string literal (i.e., '1998-12-25'), rather than a MATLAB datenum % number or a differently formatted date string. % (see http://docs.oracle.com/cd/E11882_01/server.112/e26088/sql_elements003.htm#SQLRF51062) % % UPSERT(...,'updateFcn',FUNCTION_HANDLE) % UPSERT(...,'insertFcn',FUNCTION_HANDLE) optionally allows replacement % functions for the default use of MATLAB's "update" and "fastinsert". % % UPSERT(...,'debug',true) prints out diagnostic information. % % Example: % % Imagine a database table "PHONE_NOS" with data like: % PERSONID | TYPE | NUMBER % 1 'HOME' 1234567 % 1 'MOB' 1222222 % 2 'HOME' 9888888 % % Then the MATLAB commands: % newNos = {1 'MOB' 4444444 % 2 'MOB' 5555555}; % INS = upsert(conn, 'PHONE_NOS', {'PERSONID','TYPE','NUMBER'}, [1 1 0], newNos) % % Would result in the table having contents: % PERSONID | TYPE | NUMBER % 1 'HOME' 1234567 % 1 'MOB' 4444444 % 2 'HOME' 9888888 % 2 'MOB' 5555555 % % The returned variable (INS) would be [0; 1], meaning the second row was % updated, the first row was inserted. % Author: Sven Holcombe, 2015-09-01 % Handle user configuration input IP = inputParser; IP.addParameter('updateFcn', @update, @(x)isa(x,'function_handle')); IP.addParameter('insertFcn', @fastinsert, @(x)isa(x,'function_handle')); IP.addParameter('dateFields', []); IP.addParameter('returnKeys',false,@(x)iscellstr(x)||(isscalar(x)&&nnz(x==[0 1])==1)); IP.addParameter('debug', false); IP.parse(varargin{:}); doPrint = IP.Results.debug; updateFcn = IP.Results.updateFcn; insertFcn = IP.Results.insertFcn; doReturnKeys = iscellstr(IP.Results.returnKeys) || IP.Results.returnKeys; if ~doReturnKeys && nargout>1 warning('upsert:noKeys', 'Second output (returned keys) will be empty because ''returnKeys'' option was not set') end % Firstly, handle large data sets. Later we will use an IN clause with a % comma-separated list to check key fields. Oracle has a limit of 1000 % items in a list, so let's process 1000 at a time at most. Not an optimal % solution but best solutions will differ by database flavour so this is % adequate for the moment. numRows = size(data,1); returnedKeys = zeros(numRows,0); if numRows>1000 insertMask = true(numRows,1); chunks = unique([1:999:numRows numRows+1]); for i = 1:length(chunks)-1 dataInds = chunks(i):chunks(i+1)-1; [insertMask(dataInds), rk] = ... upsert(conn,tableName,fieldNames,keyFields,data(dataInds,:), varargin{:}); if doReturnKeys if i==1 returnedKeys = zeros(numRows,size(rk,2)); end returnedKeys(dataInds,:) = rk; end end return; end % keyFields may be input as: % - A string: 'id' % - A cellstr: {'id','groupNo'} % - A logical mask the same size as fieldNames (true where field is a key) % - Indices into fieldNames of the key fields % keyFields will be transformed to the indices representation below keyFields = convertSubsetOfFieldsToIndices(keyFields, fieldNames); % Get a numeric array of which fields are DATE types for comparison dateFields = convertSubsetOfFieldsToIndices(IP.Results.dateFields, fieldNames); if isempty(data) insertMask = true(0,1); return; end % Currently it's easier (if perhaps slightly slower) to treat data as a % cell regardless of what format it was provided as. if isnumeric(data) data = num2cell(data); end % Which fields are keyFields? Build lists of them for an SQL fetch keyFieldsCell = fieldNames(keyFields); keyFieldsIsnumeric = cellfun(@(x)isnumeric(x)||islogical(x), data(1,keyFields)); keyFieldsIsdatestr = ismember(keyFields, dateFields); keyFieldsListStr = sprintf('%s,',keyFieldsCell{:}); % We don't want to gather the whole table. Only the rows matching the % primary key fields. This is most generalised by building IN () lists from % a single database query, rather than sending/recieving one query for % every row of "data" being upserted. inClauses = cell(length(keyFields),1); for i=1:length(keyFields) if keyFieldsIsnumeric(i) inSet = unique([data{:,keyFields(i)}]); if all(inSet==round(inSet)) inStr = sprintf('%d,',inSet); else inStr = sprintf('%g,',inSet); end elseif ischar(data{1,keyFields(i)}) inSet = unique(data(:,keyFields(i))); if keyFieldsIsdatestr(i) inStr = sprintf('date ''%s'',',inSet{:}); else inStr = sprintf('''%s'',',inSet{:}); end else error('upsert:badKey', 'Primary key field cannot contain %s data',class(data{1,keyFields(i)})) end if length(inSet)>1 inClauses{i} = sprintf('%s IN (%s)', keyFieldsCell{i}, inStr(1:end-1)); else inClauses{i} = sprintf('%s = %s', keyFieldsCell{i}, inStr(1:end-1)); end end % Fetch all table rows potentially matching the data we want to upsert fetchWhereClause = sprintf(' %s AND', inClauses{:}); fetchSqlStr = sprintf('SELECT %s FROM %s WHERE %s', keyFieldsListStr(1:end-1), tableName, fetchWhereClause(1:end-3)); if doPrint, fprintf('Fetching %s data in %s matching given data...', keyFieldsListStr(1:end-1), tableName), end fetchedData = fetch(conn, fetchSqlStr); if doPrint, fprintf(' done. (%d potential matches found)\n', size(fetchedData,1)), end % Build a map of which rows to be upserted already exist in the table. insertMask = true(size(data,1),1); % One if ~isempty(fetchedData) eqMap = false(size(data,1), size(fetchedData,1), length(keyFields)); for i = 1:length(keyFields) if keyFieldsIsnumeric(i) thisUpsertData = cell2mat(data(:,keyFields(i))); thisFetchedData = cast(cell2mat(fetchedData(:,i)), 'like',thisUpsertData); eqMap(:,:,i) = bsxfun(@eq, thisUpsertData, thisFetchedData'); elseif keyFieldsIsdatestr(i) thisUpsertData = datenum(data(:,keyFields(i))); thisFetchedData = datenum(fetchedData(:,i)); eqMap(:,:,i) = bsxfun(@eq, thisUpsertData, thisFetchedData'); else thisUpsertData = data(:,keyFields(i)); thisFetchedData = fetchedData(:,i)'; eqCell = cellfun(@(x)strcmp(x, thisFetchedData), thisUpsertData, 'Un',0); eqMap(:,:,i) = cat(1, eqCell{:}); end end pkeysMatchMap = all(eqMap,3); insertMask = ~any(pkeysMatchMap,2); end % First find any data rows that do NOT yet exist in table. Insert them. if any(insertMask) if doPrint, fprintf('Inserting %d data rows not currently in %s...', nnz(insertMask), tableName), end if doReturnKeys insertedRK = insertFcn(conn,tableName,fieldNames,data(insertMask,:),'returnKeys',IP.Results.returnKeys); else insertFcn(conn,tableName,fieldNames,data(insertMask,:)); end if doPrint, fprintf(' done.\n'), end else insertedRK = []; end % Next, update ALL rows to the values given in data. First build WHERE. whereEqClauses = cell(numRows, length(keyFields)); for i=1:length(keyFields) if keyFieldsIsnumeric(i) whereEqClauses(:,i) = cellfun(@(dat)sprintf('%s = %g', keyFieldsCell{i}, dat), data(:,keyFields(i)),'Un',0); elseif keyFieldsIsdatestr(i) whereEqClauses(:,i) = cellfun(@(dat)sprintf('%s = date ''%s''', keyFieldsCell{i}, dat), data(:,keyFields(i)),'Un',0); else whereEqClauses(:,i) = cellfun(@(dat)sprintf('%s = ''%s''', keyFieldsCell{i}, dat), data(:,keyFields(i)),'Un',0); end end dataWhereClauses = cellfun(@(strs)sprintf(' %s AND',strs{:}), num2cell(whereEqClauses,2),'Un',0); dataWhereClauses = cellfun(@(str)['WHERE ' str(1:end-3)], dataWhereClauses, 'Un',0); % Next, run the update on all the NON-keyField fields (since the key fields % themselves are being matched, so won't change). Note that the "update" % function can be replaced by a user's modified update function. otherFields = setdiff(1:length(fieldNames), keyFields); if doPrint, fprintf('Updating %d data rows (%d new, %d old) in %s...', length(insertMask), nnz(insertMask), nnz(~insertMask), tableName), end if doReturnKeys updatedRK = updateFcn(conn,tableName,fieldNames(otherFields),data(~insertMask,otherFields), dataWhereClauses(~insertMask),'returnKeys',IP.Results.returnKeys); returnedKeys = zeros(size(cat(1,insertedRK, updatedRK))); if any(insertMask) returnedKeys(insertMask,:) = insertedRK; end if any(~insertMask) returnedKeys(~insertMask,:) = updatedRK; end else updateFcn(conn,tableName,fieldNames(otherFields),data(~insertMask,otherFields), dataWhereClauses(~insertMask)); end if doPrint, fprintf(' done.\n'); end function subFields = convertSubsetOfFieldsToIndices(subFields, allFields) % convert subFields from various classes to a numeric index of allFields % may be input as: % - A string: 'id' % - A cellstr: {'id','groupNo'} % - A logical (or 0,1) mask the same size as fieldNames % - Indices into fieldNames of the key fields % keyFields will be transformed to the indices representation below if ischar(subFields) || iscellstr(subFields) subFields = ismember(upper(allFields), upper(subFields)); end if isnumeric(subFields) && (any(subFields==0) || nnz(subFields==1)>1) subFields = logical(subFields); end if islogical(subFields) subFields = find(subFields); end