// Approach: speed up the DP with the quadrangle-inequality (Knuth) optimization.


#include <bits/stdc++.h>
using namespace std;
typedef long long LL;

// Capacity of the first dimension of f and s (indexed by i in work()).
const int maxm = 808;
// Capacity of the second dimension of f and s, and of both dimensions of a.
const int maxn = 4004;
// Large sentinel. Every byte is 0x3f, so memset-ing an int array with it
// leaves each element equal to this value.
const int INF = 0x3f3f3f3f;

// f[i][j]: minimum total found by work() for layer i over the first j indices.
int f[maxm][maxn];
// s[i][j]: the split point k that produced f[i][j]; work() uses neighbouring
// entries to bound the range of k it scans.
int s[maxm][maxn];
// Input matrix, stored 1-based; work() turns it in place into 2D prefix sums.
int a[maxn][maxn];
// n: side length of the matrix; m: number of DP layers. Both are read in work().
int n, m;

/**
 * Reads the next non-negative decimal integer from stdin into x.
 *
 * Every character that is not a decimal digit (space, '\n', '\r', tab, ...)
 * is skipped as a separator. Parsing stops at the first non-digit after the
 * number or at end of input, so a final number without a trailing newline is
 * read correctly instead of looping forever on EOF.
 *
 * @param x receives the parsed value; left at 0 when no digit precedes EOF.
 */
void read(int &x)
{
	x = 0;
	int ch = getchar();  // keep as int so EOF stays distinguishable from data
	while(ch != EOF && (ch < '0' || ch > '9')) ch = getchar();
	while(ch >= '0' && ch <= '9') {
		x = x * 10 + (ch - '0');
		ch = getchar();
	}
}

// Sum of the square block with rows and columns i..j (inclusive), taken from
// the 2D prefix-sum table `a` by inclusion-exclusion.
int calc(int i, int j)
{
	const int before = i - 1;      // last row/column that lies outside the block
	int total = a[j][j];           // everything up to (j, j)
	total -= a[before][j];         // drop the rows above the block
	total -= a[j][before];         // drop the columns left of the block
	total += a[before][before];    // the top-left corner was dropped twice
	return total;
}

void work()
{
	scanf("%d%d", &n, &m);
	for(int i = 1; i <= n; i++)
		for(int j = 1; j <= n; j++)
		 	read(a[i][j]);
	for(int i = 1; i <= n; i++)
		for(int j = 1; j <= n; j++)
			a[i][j] += a[i][j-1];
	for(int i = 1; i <= n; i++)
		for(int j = 1; j <= n; j++)
			a[i][j] += a[i-1][j];
	
	memset(f, INF, sizeof f);
	f[0][0] = 0;
	for(int i = 1; i <= m; i++)
		for(int j = n; j >= 1; j--) {
			for(int k = s[i-1][j]; k <= (j == n ? n : s[i][j+1]) && k < j; k++) {
				if(f[i][j] > f[i-1][k] + calc(k+1, j)) {
					f[i][j] = f[i-1][k] + calc(k+1, j);
					s[i][j] = k;
				}
			}
		}
	printf("%d\n", f[m][n] / 2);
}

// Program entry point: solves a single test case read from standard input.
int main()
{
	// For local debugging, redirect stdin to a file named "data":
	// freopen("data", "r", stdin);
	work();
}